diff --git a/.gitattributes b/.gitattributes
index cc2d7e7bcc456102ee879502c1c0c1311f4f1098..3908da47dcdf344e365c9d7e9fa7d24895f8dbb3 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -34,3 +34,59 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-1100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-1200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-1300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-1400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-1500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-1600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-1700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-1800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-1900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-2000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-2100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-2200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-2300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-2400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-2500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-2600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-2700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-2800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-2900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-3000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-3100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-3200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-3300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-3400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-3500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-3600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-3700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-3800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-3900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-4000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-4100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-4200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-4300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-4400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-4500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-4600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-4700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-4800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-4900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-5000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-5100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-5200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-5300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-5400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-5500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..949a98efd228358c4d1d1ad461114a3ee2d232f8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,63 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+model_name: telugu
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+licence: license
+pipeline_tag: text-generation
+---
+
+# Model Card for telugu
+
+This model is a fine-tuned version of [unsloth/gemma-4-E4B-it](https://huggingface.co/unsloth/gemma-4-E4B-it).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="<your-hub-username>/telugu", device="cuda")  # replace with this adapter's Hub repo id or a local path
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/rohithsaimidigudla-omnisynkai/gemma-health-adapters/runs/mwc9jt0z) 
+
+
+This model was trained with SFT.
+
+### Framework versions
+
+- PEFT: 0.19.1
+- TRL: 0.19.1
+- Transformers: 5.5.0
+- Pytorch: 2.7.0+cu128
+- Datasets: 3.6.0
+- Tokenizers: 0.22.2
+
+## Citations
+
+
+
+Cite TRL as:
+    
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```
\ No newline at end of file
diff --git a/adapter_config.json b/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/adapter_model.safetensors b/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..13e7eccf51f5d9d11e1fc349773e81db85eac36b
--- /dev/null
+++ b/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:023fcb9c596c99c5e8d74320f9720621834918ec3bcd5d877b44b0fe0907ce2e
+size 169741912
diff --git a/chat_template.jinja b/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Render a properties mapping as comma-separated key:{...} entries (dictsort order); when filter_keys is true, keys listed in standard_keys are skipped. NOTE(review): the required argument is not read anywhere in this body - confirm whether that is intended. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- becomes true once a field has been written inside the current key:{...} entry -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Render one tool definition: declaration:NAME, then a braced body holding the description, an optional parameters section and an optional response section. NOTE(review): the closing brace of parameters is only written inside the params['type'] branch, and the closing brace of response only when its type is OBJECT - confirm inputs always satisfy both. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serialize a value recursively: strings are wrapped in <|"|> markers, booleans become true/false, mappings become a braced comma-separated key:value list in dictsort order (keys wrapped in <|"|> only when escape_keys is true), other sequences become a bracketed comma-separated list, and anything else is emitted as-is. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Remove every span that starts at <|channel> and runs to the next <channel|> (or to the end of text when unclosed), then trim the concatenated remainder. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emit one tool result between <|tool_response> and <tool_response|> as response:NAME followed by a braced body: a mapping is written as its key:value pairs in dictsort order, any other value as a single value: entry; values go through format_argument with escape_keys=False. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- System turn: emitted when thinking is enabled, tools are supplied, or the first message has role system/developer; a leading system/developer message is consumed here and dropped from loop_messages -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: record the index within loop_messages of the last user message (-1 when there is none); reasoning is only rendered for messages located after that index -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages; role:tool messages are skipped as loop items and are instead rendered by the forward scan of the tool-calling message that precedes them -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel, only for messages after the last user message that also carry tool_calls -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages, stopping at the first non-tool message -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name; falls back to the tool message's own name, then 'unknown' -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array (only parts of type text are concatenated) -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- tool call with no response rendered: emit a <|tool_response> opener instead of closing the turn -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- Open a new model turn for generation unless the last rendered message ended on a tool call or tool response -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-100/README.md b/checkpoint-100/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-100/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-100/adapter_config.json b/checkpoint-100/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-100/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-100/adapter_model.safetensors b/checkpoint-100/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..65f7cd319648121f105b20714d692f24f6414140
--- /dev/null
+++ b/checkpoint-100/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a0d76b0ebb45ec68a37d642d7342c66a7ebc9bc3239f3387972226f24509e56
+size 169741912
diff --git a/checkpoint-100/chat_template.jinja b/checkpoint-100/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-100/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%} {#- Renders each entry of a schema properties mapping, in dictsort order, as a comma-separated key-colon-object item; the required argument is accepted but never read in this body -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} {#- schema keywords that are skipped as property names when filter_keys is true -#}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%} {#- per-property flag: becomes true once a field has been written, so the next field gets a leading comma -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%} {#- strings may carry an enum list -#}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%} {#- arrays render their items schema key by key, skipping none values -#}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%} {#- objects recurse: explicit properties when present, otherwise the value's own non-standard keys via filter_keys=true -#}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} {#- the upper-cased type is always written last and closes the property object -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%} {#- Renders one tool definition: declaration name, description, then optional parameters and response objects -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>
+        {%- endif -%}{{- '}' -}} {#- always close the parameters object; previously the brace was only written when a type was present, leaving the declaration unbalanced otherwise -#}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>
+        {%- endif -%}{{- '}' -}} {#- always close the response object, including when its type is not OBJECT -#}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {#- Serialise one value into the tool argument syntax, recursing into containers. -#}
+    {#- Strings are wrapped in <|"|> markers and booleans become true or false. -#}
+    {#- The escape_keys flag is passed unchanged to every nested call. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {#- Mappings: key:value pairs in dictsort order; keys are quoted only when escape_keys is set -#}
+        {{- '{' -}}
+        {%- for field, inner in argument | dictsort -%}
+            {{- ',' if not loop.first else '' -}}
+            {{- ('<|"|>' + field + '<|"|>') if escape_keys else field -}}
+            :{{- format_argument(inner, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {#- Remaining sequences: comma-separated items inside square brackets -#}
+        {{- '[' -}}
+        {%- for element in argument -%}
+            {{- format_argument(element, escape_keys=escape_keys) -}}
+            {{- ',' if not loop.last else '' -}}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {#- Anything else is printed as-is -#}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}
+    {#- Remove thinking-channel spans from text and return the trimmed remainder. -#}
+    {#- The text is cut at every <channel|> closer; each piece keeps only what precedes -#}
+    {#- its first <|channel> opener, so an unterminated trailing span is dropped as well. -#}
+    {%- set acc = namespace(kept='') -%}
+    {%- for segment in text.split('<channel|>') -%}
+        {%- set visible = segment.split('<|channel>')[0] if '<|channel>' in segment else segment -%}
+        {%- set acc.kept = acc.kept + visible -%}
+    {%- endfor -%}
+    {{- acc.kept | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {#- Wrap one tool result in tool_response markers: a mapping is spread into its sorted fields, any other value is stored under a single value field -#}
+    {{- '<|tool_response>' -}}
+    {{- 'response:' + tool_name + '{' -}}
+    {%- if response is mapping -%}
+        {%- for field, payload in response | dictsort -%}
+            {{- field -}}:{{- format_argument(payload, escape_keys=False) -}}
+            {{- ',' if not loop.last else '' -}}
+        {%- endfor -%}
+    {%- else -%}
+        {{- 'value:' + format_argument(response, escape_keys=False) -}}
+    {%- endif -%}
+    {{- '}' + '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%} {#- kind of the last element rendered; read after the message loop to decide whether a generation prompt is needed -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Emit a system turn when thinking is enabled, tools are supplied, or the first message is a system/developer message -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject the thinking token at the very top of the system turn, before any system text -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%} {#- content parts: each part's trimmed text is written followed by one space -#}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%} {#- the leading system/developer message is consumed here, so the main loop starts after it -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan for the reasoning guard: remember the position of the final user message -#}
+{#- in loop_messages, or -1 when there is none. The message loop renders reasoning only -#}
+{#- for messages located after this index. -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for candidate in loop_messages -%}
+    {%- set ns_turn.last_user_idx = loop.index0 if candidate['role'] == 'user' else ns_turn.last_user_idx -%}
+{%- endfor -%}
+
+{#- Render every non-tool message as a turn; role:tool messages are folded into the preceding assistant tool call by the forward scan below -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel, only for tool-calling messages after the last user message -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%} {#- becomes true once any tool response has been written for this message -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name; default needs its boolean flag because .get() yields None, not undefined, for a missing name -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+{#- Turn ending: an unanswered tool call opens a tool response slot; a tool response without text leaves the turn open; otherwise the turn is closed -#}
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- When a reply is requested, open a model turn, except when the last thing rendered was a -#}
+{#- tool call or a tool response; in those two cases no new turn header is written. -#}
+{%- if add_generation_prompt and ns.prev_message_type not in ['tool_response', 'tool_call'] -%}
+    {{- '<|turn>model\n' -}}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-100/optimizer.pt b/checkpoint-100/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b875bef738c81236b17817d1cf9a749f98fc8bef
--- /dev/null
+++ b/checkpoint-100/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ebe97922ef0bee5a2887cb2ee8f12595764d517de7176ed003caf71939844df
+size 71463733
diff --git a/checkpoint-100/processor_config.json b/checkpoint-100/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-100/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-100/rng_state.pth b/checkpoint-100/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-100/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-100/scheduler.pt b/checkpoint-100/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..35089dbf59de1bc505764378fda1dbc247fe0d6b
--- /dev/null
+++ b/checkpoint-100/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfa39a08ca6ca0b25c44556fe7464362808ae67fd00d1432e1130777acac8674
+size 1465
diff --git a/checkpoint-100/tokenizer.json b/checkpoint-100/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-100/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-100/tokenizer_config.json b/checkpoint-100/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-100/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-100/trainer_state.json b/checkpoint-100/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d86d0125e0eee9e10ca1d4c1419f560117f3bbab
--- /dev/null
+++ b/checkpoint-100/trainer_state.json
@@ -0,0 +1,182 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.018195050946142648,
+  "eval_steps": 100,
+  "global_step": 100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.444622973392128e+16,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-100/training_args.bin b/checkpoint-100/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bc886230771b1d9c8f306c7a4b9b3c7960936750
--- /dev/null
+++ b/checkpoint-100/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:195f79601dec1ad668a414b5c045319cec84f48961f45b7d32762f86750cd8b1
+size 5777
diff --git a/checkpoint-1000/README.md b/checkpoint-1000/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-1000/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** unsloth/gemma-4-E4B-it
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-1000/adapter_config.json b/checkpoint-1000/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-1000/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-1000/adapter_model.safetensors b/checkpoint-1000/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e6ea49cef3e00abb142b0eeedfbae9c372378b5d
--- /dev/null
+++ b/checkpoint-1000/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f94c7dd4d79ecdb435c295a616d4707c2bf0e734fbefe7d10ecfa59b195ee625
+size 169741912
diff --git a/checkpoint-1000/chat_template.jinja b/checkpoint-1000/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-1000/chat_template.jinja
@@ -0,0 +1,351 @@
+{#- format_parameters(properties, required, filter_keys=false)
+    Renders every entry of `properties` (visited in key order via dictsort) as
+    key:{ ... }, with entries separated by commas. For each entry the fields are
+    written in this order: description (when truthy), enum (STRING types only),
+    items (ARRAY types whose items is a non-empty mapping), nullable:true,
+    nested properties and required (OBJECT types), and finally the upper-cased
+    type, whose line also writes the entry's closing brace.
+    description, required names and type are wrapped in the <|"|> quote marker.
+    add_comma tracks whether a field has already been written for the current
+    entry so that later fields are preceded by a comma.
+    filter_keys: when true, keys listed in standard_keys are skipped; it is set
+    by the OBJECT fallback below, which recurses over the value mapping itself.
+    NOTE(review): the `required` argument is accepted but never read inside this
+    macro body; the signature is kept because callers pass it positionally.
+-#}
+{%- macro format_parameters(properties, required, filter_keys=false) -%}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{#- format_function_declaration(tool_data)
+    Renders one tool definition as
+    declaration:NAME{description:...,parameters:{ ... },response:{ ... }}.
+    Reads name, description, parameters and (when the key is present) response
+    from tool_data['function']. Parameter properties are rendered by
+    format_parameters; required names and type strings are wrapped in <|"|>.
+    NOTE(review): the closing brace of the parameters object is written only
+    inside the params['type'] branch, and the closing brace of the response
+    object only when the response type upper-cases to OBJECT, so other inputs
+    yield unbalanced braces. Left untouched here because the rendered text is
+    the prompt format -- confirm against the reference template before changing.
+-#}
+{%- macro format_function_declaration(tool_data) -%}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{#- format_argument(argument, escape_keys=True)
+    Recursively serializes a value:
+      string   -> the text wrapped in the <|"|> quote marker
+      boolean  -> true / false
+      mapping  -> { key:value, ... } in key order (dictsort); keys are wrapped in
+                  <|"|> only when escape_keys is true
+      sequence -> [ item, ... ]
+      other    -> written as-is
+    The string test runs before the sequence test, so strings never reach the
+    list branch. escape_keys is passed unchanged to every nested call.
+-#}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{#- strip_thinking(text)
+    Removes thinking-channel spans from text. The text is split on the closing
+    marker '<channel|>'; from every piece that contains the opening marker
+    '<|channel>' only the part before that marker is kept, pieces without it are
+    kept whole. The kept parts are concatenated and the result is trimmed.
+    An opening marker with no closing marker drops everything after it.
+-#}
+{%- macro strip_thinking(text) -%}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{#- format_tool_response_block(tool_name, response)
+    Writes one tool result between '<|tool_response>' and '<tool_response|>'.
+    A mapping response becomes response:NAME{ key:value, ... } in key order;
+    any other response is wrapped as response:NAME{ value:... }.
+    Values are serialized with format_argument(escape_keys=False).
+    tool_name is concatenated with '+', so it is expected to be a string.
+-#}
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{#- Template entry point: shared state, BOS token and the optional system turn.
+    ns.prev_message_type records the kind of the last element written; it is
+    read again when closing turns and when emitting the generation prompt.
+    loop_messages is the list the main loop iterates; it drops messages[0] when
+    that message is consumed here as the system/developer prompt. -#}
+{%- set ns = namespace(prev_message_type=None) -%}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block: a system turn is opened when thinking
+    is enabled, tools are supplied, or the first message is system/developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {#- System text: a string is trimmed; for a list, each item's text is trimmed
+        and followed by a single space -#}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {#- Each tool declaration is wrapped in '<|tool>' ... '<tool|>' -#}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard.
+    ns_turn.last_user_idx stays -1 when loop_messages has no user message; the
+    message loop renders stored reasoning only for messages after this index. -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages.
+    role:tool messages produce no output on their own iteration; they are
+    rendered only by the forward scan of a preceding message that has tool_calls.
+    For every other message the loop writes, in order: the turn header (unless
+    the message continues a model turn), the thinking channel, tool calls, tool
+    responses, the message content, and the turn terminator. -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel; only for messages
+        after the last user message that also carry tool_calls -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {#- Tool calls: mapping arguments are written as key:value pairs in key
+                order, string arguments are written verbatim -#}
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {#- ns_tr_out.flag becomes true once at least one tool response has been written -#}
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {#- default(..., true) also replaces a None/empty name; the one-argument
+                        form only replaces undefined values and would let None reach the
+                        string concatenation in format_tool_response_block -#}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown', true), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name; .get() returns None for a
+                            missing key, so default needs its boolean flag to fall back -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {#- a text part whose text is None contributes an empty string -#}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {#- Capture the rendered content first so that emptiness can be tested
+                below; model text has thinking spans removed, other text is trimmed,
+                and image/audio/video parts become their placeholder tokens -#}
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {#- Turn terminator: tool calls with no rendered response end on an opening
+            '<|tool_response>'; rendered responses with no content leave the turn
+            open; every other case closes the turn -#}
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Generation prompt: open a new model turn unless the last rendered message
+    ended on a tool call or tool response, in which case no header is added. -#}
+{%- if add_generation_prompt -%}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-1000/optimizer.pt b/checkpoint-1000/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5db9f22bf2e772e6511dedec2f5297297df9802b
--- /dev/null
+++ b/checkpoint-1000/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:795a63e9a73654a7dd8a4dac66a5a2b305d11f32784400415681ec19ef91f007
+size 72807355
diff --git a/checkpoint-1000/processor_config.json b/checkpoint-1000/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-1000/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-1000/rng_state.pth b/checkpoint-1000/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-1000/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5dd592c118762fb5e01051201291cfbb5392dbcd
--- /dev/null
+++ b/checkpoint-1000/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:406994c2cf2acc1e48ce8857e7cbb9e95d4fab92a97bbe36f71721705be347d7
+size 1465
diff --git a/checkpoint-1000/tokenizer.json b/checkpoint-1000/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-1000/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-1000/tokenizer_config.json b/checkpoint-1000/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-1000/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..9c3d8304c154fda794f591a45f133637c71e7699
--- /dev/null
+++ b/checkpoint-1000/trainer_state.json
@@ -0,0 +1,1442 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.1819505094614265,
+  "eval_steps": 100,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.583006871819799e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-1000/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-1100/README.md b/checkpoint-1100/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-1100/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-1100/adapter_config.json b/checkpoint-1100/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-1100/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-1100/adapter_model.safetensors b/checkpoint-1100/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..cb2647dbe1250de395963cb8f8ad1ab34e3f03be
--- /dev/null
+++ b/checkpoint-1100/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4be3bea2ca3bd38e446c68a30717eb1a31d7d5b77955efe33bf656a8162068a
+size 169741912
diff --git a/checkpoint-1100/chat_template.jinja b/checkpoint-1100/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-1100/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders a schema `properties` mapping as comma-separated key:{...} entries, iterating keys via dictsort. When filter_keys is true, keys listed in standard_keys are skipped. The `required` argument is accepted but never read inside this macro. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- No usable 'properties' mapping: recurse over value itself with filter_keys=true so only its non-standard keys are rendered as nested properties -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- The type field is always written last; the brace after it closes this key's entry -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool definition as declaration:NAME{description:...} followed by optional parameters:{...} and response:{...} sections read from tool_data['function']. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the brace that closes parameters:{ is only written inside this branch, so a parameters object without a type leaves it unclosed; confirm callers always supply a type -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): the brace that closes response:{ is only written when the type is OBJECT; confirm other response types are never passed -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Recursively serializes `argument`: strings are wrapped in the quote token, booleans become true/false, mappings become {k:v,...} iterated via dictsort (keys are quoted only when escape_keys is true), other sequences become [a,b,...], and any other value is emitted unchanged. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}{#- Strings and mappings were handled by the earlier branches, so this branch only sees list-like values -#}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Returns `text` without its thinking spans, trimmed: the text is split on the closing channel marker and, in each piece, everything from the first opening channel marker onward is dropped. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emits one tool result between the tool_response open/close tokens. A mapping is written as response:NAME{key:value,...} with unquoted keys iterated via dictsort; any other value is written as response:NAME{value:...}. tool_name is joined with '+', so it must be a string. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- System turn: emitted when thinking is enabled, tools are provided, or the first message is a system/developer message -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as a thinking channel, only for tool-calling messages that come after the last user message -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag becomes true once any tool response has been rendered for this message -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native); a missing or None name falls back to 'unknown' -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown', true), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages, stopping at the first non-tool message -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name. .get() yields None (not undefined) for a missing key, so default() needs its boolean flag to fall back to 'unknown' -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array; only parts of type 'text' are concatenated, and a None text counts as empty -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}{#- Open a new model turn unless the last rendered message ended on a tool call or tool response. NOTE(review): prev_message_type stays 'tool_response' even when the message loop closed that turn with the end-of-turn token (tool responses followed by non-empty content), so no model header is opened in that case; confirm this is intended -#}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-1100/optimizer.pt b/checkpoint-1100/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f14357495e786417a1152b2baa822b3169f33375
--- /dev/null
+++ b/checkpoint-1100/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:66120ce4d55186cce9be5cdf28e030e89994c81dac5711321d07d2b5ce8153e3
+size 72807355
diff --git a/checkpoint-1100/processor_config.json b/checkpoint-1100/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-1100/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-1100/rng_state.pth b/checkpoint-1100/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-1100/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-1100/scheduler.pt b/checkpoint-1100/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a30411f3b56e549bf9d0e3fcfc041ba0aea0119e
--- /dev/null
+++ b/checkpoint-1100/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:936724e73ecd7ecf26460f7aeb2b5af5460899f93c78695a46fc00c541454d94
+size 1465
diff --git a/checkpoint-1100/tokenizer.json b/checkpoint-1100/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-1100/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-1100/tokenizer_config.json b/checkpoint-1100/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-1100/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-1100/trainer_state.json b/checkpoint-1100/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..fe7549052c2122f500081edda24a93eabfecd1ac
--- /dev/null
+++ b/checkpoint-1100/trainer_state.json
@@ -0,0 +1,1582 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.20014556040756915,
+  "eval_steps": 100,
+  "global_step": 1100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.127484770153037e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-1100/training_args.bin b/checkpoint-1100/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-1100/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-1200/README.md b/checkpoint-1200/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-1200/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-1200/adapter_config.json b/checkpoint-1200/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-1200/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-1200/adapter_model.safetensors b/checkpoint-1200/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..292d935185c8ac00919d157f9ff64a889d356961
--- /dev/null
+++ b/checkpoint-1200/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:758b7e5c64f7b3b9a2dfb7f9c3f402266b67013f70427ae941acb07350f0c694
+size 169741912
diff --git a/checkpoint-1200/chat_template.jinja b/checkpoint-1200/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-1200/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%} {#- Renders every entry of properties (sorted by key) as a comma-separated key:{...} schema fragment; recurses for ARRAY items and OBJECT properties. NOTE(review): the required argument is never read inside this macro. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} {#- keys skipped when filter_keys is true -#}
+    {%- set ns = namespace(found_first=false) -%} {#- found_first: an entry was already written, so the next one needs a leading comma -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%} {#- add_comma: a field was already written inside the current key:{...} entry -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%} {#- STRING: optional enum list -#}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%} {#- ARRAY: render the non-empty items mapping field by field -#}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%} {#- none-valued item fields are omitted -#}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%} {#- type may be one string or a list of strings; both are upper-cased -#}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%} {#- OBJECT without a properties mapping: recurse over the value itself with filter_keys=true so only its non-standard keys are rendered -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} {#- the upper-cased type is always the last field and closes the entry -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%} {#- Renders one tool as declaration:NAME{description:...,parameters:{...},response:{...}} using the fields under tool_data['function']; the parameters and response sections are optional. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%} {#- NOTE(review): the brace that closes parameters is written only in this branch, so params without a truthy type leave it unclosed; confirm this is intended before changing the prompt format -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%} {#- NOTE(review): the brace that closes response is written only when the type is OBJECT; any other type leaves it unclosed; confirm this is intended -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%} {#- Recursively serialises one value: strings are wrapped in the quote token, booleans become true/false, mappings become {k:v,...} sorted by key, sequences become [a,b,...], anything else is printed as-is. escape_keys controls whether mapping keys are also wrapped in the quote token. -#}
+    {%- if argument is string -%} {#- NOTE(review): string and mapping are tested before sequence, presumably because the Jinja sequence test also accepts them; confirm before reordering -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%} {#- found_first: a pair was already written, so the next one needs a leading comma -#}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%} {#- Returns text with each span from a channel-open marker up to the next channel-close marker removed, then trimmed. -#}
+    {%- set ns = namespace(result='') -%} {#- a namespace is used so the accumulated result survives the for-loop scope -#}
+    {%- for part in text.split('<channel|>') -%} {#- each part ends where a close marker stood -#}
+        {%- if '<|channel>' in part -%} {#- keep only the text before the open marker; the rest of the part is dropped -#}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%} {#- Wraps one tool result in the tool_response open/close markers as response:NAME{...}. A mapping response is rendered as key:value pairs sorted by key; any other response is rendered as a single value field. Keys are never quote-wrapped (escape_keys=False). -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%} {#- non-mapping response: emit it under the fixed key value -#}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}
+{%- set loop_messages = messages -%} {#- default: iterate over every message; narrowed below when the first message is rendered as the system turn -#}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} {#- a system turn is emitted when thinking is enabled, tools are supplied, or the conversation starts with a system/developer message -#}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%} {#- list content: each item's text is trimmed and followed by one space -#}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%} {#- the leading system/developer message was rendered here, so it is excluded from the main loop -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%} {#- stays -1 when loop_messages contains no user message -#}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages; role:tool messages are skipped here and rendered by the forward scan of the preceding message that carries tool_calls -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+{#- Serialize each tool call as <|tool_call>call:NAME{key:value,...}<tool_call|>; mapping arguments are emitted in key-sorted order, string arguments verbatim -#}
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+{#- Emit tool responses inside the same turn; ns_tr_out.flag records whether at least one response block was written -#}
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown', true), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name; .get() yields None (not undefined), so default() needs its boolean flag; an explicit name wins when the tool message has no tool_call_id -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') and (follow.get('tool_call_id') or not follow.get('name')) -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+{#- Capture the rendered message body in a variable so its emptiness can be tested after it is emitted -#}
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+{#- Turn terminator: tool calls with no response written open <|tool_response>; responses with no trailing content emit nothing (turn stays open); every other case ends with <turn|> -#}
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Generation prompt: start a new model turn when requested, except when the last -#}
+{#- rendered item was a tool call or tool response (prev_message_type set by the loop). -#}
+{%- if add_generation_prompt and ns.prev_message_type not in ['tool_response', 'tool_call'] -%}
+    {{- '<|turn>model\n' -}}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-1200/optimizer.pt b/checkpoint-1200/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8f13dac5f0b23147b58f5cda5e2c132b30fb971b
--- /dev/null
+++ b/checkpoint-1200/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e1e647229ebd58f619f9224174e6d5fab90526935a57bf68b5a5fbc119fb909
+size 72807355
diff --git a/checkpoint-1200/processor_config.json b/checkpoint-1200/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-1200/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-1200/rng_state.pth b/checkpoint-1200/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-1200/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-1200/scheduler.pt b/checkpoint-1200/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ea3b11a51b0e81378736f4c09e20d923f2a9c07c
--- /dev/null
+++ b/checkpoint-1200/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efcf962131305188aae5d8c42fb21f39c330e15fc73bc76b4411e357b0d01cee
+size 1465
diff --git a/checkpoint-1200/tokenizer.json b/checkpoint-1200/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-1200/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-1200/tokenizer_config.json b/checkpoint-1200/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-1200/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-1200/trainer_state.json b/checkpoint-1200/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..dad094a6a4be8ecdf658f8f1f7edf3308e22b870
--- /dev/null
+++ b/checkpoint-1200/trainer_state.json
@@ -0,0 +1,1722 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.2183406113537118,
+  "eval_steps": 100,
+  "global_step": 1200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.681503343752571e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-1200/training_args.bin b/checkpoint-1200/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-1200/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-1300/README.md b/checkpoint-1300/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-1300/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-1300/adapter_config.json b/checkpoint-1300/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-1300/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-1300/adapter_model.safetensors b/checkpoint-1300/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..07c4583f14fc07208b1f75ebd0101b70546e0f20
--- /dev/null
+++ b/checkpoint-1300/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f21824c6a8f5083ba2221748ebd811b7a9fc6278660a9f4521ad7824fdcbb2c6
+size 169741912
diff --git a/checkpoint-1300/chat_template.jinja b/checkpoint-1300/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-1300/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%} {#- Renders each entry of `properties`, sorted by key, as a comma-separated `key:{...}` item. The `required` argument is accepted but never read in this body. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} {#- Keys that are skipped when filter_keys is true. -#}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%} {#- Comma before every emitted entry except the first. -#}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%} {#- No 'properties' mapping: recurse over the value itself with filter_keys=true so the standard schema keys are skipped. -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} {#- 'type' is always the last field; the line that emits it also closes this entry's brace. -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%} {#- Renders one tool as `declaration:NAME{description:...,parameters:{...},response:{...}}`, reading everything from tool_data['function']. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%} {#- NOTE(review): the brace opened by `parameters:{` is closed only inside this branch; confirm callers always supply params['type']. -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%} {#- NOTE(review): the brace opened by `response:{` is closed only when the response type is OBJECT; confirm other types never occur. -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%} {#- Recursively serializes a value: strings are wrapped in the quote token, booleans become true/false, mappings become `{k:v,...}` sorted by key, sequences become `[...]`. -#}
+    {%- if argument is string -%} {#- Tested before `sequence` because Jinja's sequence test also accepts strings. -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%} {#- Keys are wrapped in the quote token only when escape_keys is true; the flag is passed down unchanged to nested values. -#}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%} {#- Any other value (for example a number) is emitted as-is. -#}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%} {#- Returns `text` with every `<|channel>...<channel|>` span removed, then trimmed. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%} {#- Each part ends where a channel closed, so anything after an opening marker inside it is channel content. -#}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} {#- Keep only the text before the opening marker. -#}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%} {#- Emits `<|tool_response>response:NAME{...}<tool_response|>` for one tool result. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%} {#- Mapping results are written as key:value pairs sorted by key, with unquoted keys. -#}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%} {#- Any non-mapping result is wrapped under a single `value` key. -#}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%} {#- Records what was rendered last; read later when closing a turn and when deciding on the generation prompt. -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block: emitted when thinking is enabled, tools are given, or the first message is system/developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%} {#- The system/developer message is rendered here, so the main loop starts after it. -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find the index within loop_messages of the last user message (stays -1 when there is none); the reasoning guard in the message loop compares against it -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages; role:tool messages are skipped here and rendered by the forward scan of the assistant message that precedes them -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel, but only for tool-calling messages that come after the last user message -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%} {#- String arguments are inserted verbatim between the braces. -#}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%} {#- flag becomes true once at least one tool response has been rendered for this message. -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%} {#- Empty branch: once a non-tool message has been seen, later indices are ignored. -#}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name; falls back to the tool message's own name, then 'unknown' -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array (only parts of type 'text' are concatenated) -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%} {#- Computed from the rendered text, so media placeholder tokens count as content. -#}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} {#- Tool calls were emitted but no response was rendered: open a tool response instead of closing the turn. -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%} {#- The turn is left open only when tool responses were rendered and there is no content. -#}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%} {#- Optionally append the header that starts a new model turn. -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} {#- Skipped when the last rendered message ended with a tool call or tool response. -#}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-1300/optimizer.pt b/checkpoint-1300/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c44d446e2fdb1bf1a1b53cd353392a5325a742cc
--- /dev/null
+++ b/checkpoint-1300/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5175257136617c2ec9a887b2cd454032ebf3146bc0c36e17cbd121d9793eadfc
+size 72807355
diff --git a/checkpoint-1300/processor_config.json b/checkpoint-1300/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-1300/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-1300/rng_state.pth b/checkpoint-1300/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-1300/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-1300/scheduler.pt b/checkpoint-1300/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c7b8971608fe833cc9e47f0e552ebb0c2d967871
--- /dev/null
+++ b/checkpoint-1300/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3a92a0fa04e93d4646a22102148247efeafa7bddd78a00b75c6f94617b15fd2
+size 1465
diff --git a/checkpoint-1300/tokenizer.json b/checkpoint-1300/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-1300/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-1300/tokenizer_config.json b/checkpoint-1300/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-1300/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-1300/trainer_state.json b/checkpoint-1300/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d6b247df7d42d81cf64caf66e08e88fbff8e386b
--- /dev/null
+++ b/checkpoint-1300/trainer_state.json
@@ -0,0 +1,1862 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.23653566229985443,
+  "eval_steps": 100,
+  "global_step": 1300,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.228400527632223e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-1300/training_args.bin b/checkpoint-1300/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-1300/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-1400/README.md b/checkpoint-1400/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-1400/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-1400/adapter_config.json b/checkpoint-1400/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-1400/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-1400/adapter_model.safetensors b/checkpoint-1400/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4f01d4e06af34c6612821e55a7eb012b178477cf
--- /dev/null
+++ b/checkpoint-1400/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0cb28cfb399618f61de528c45e4399799c020a48d356ee719d582cf526507da2
+size 169741912
diff --git a/checkpoint-1400/chat_template.jinja b/checkpoint-1400/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-1400/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}
+    {#- Renders a mapping of parameter schemas as comma-separated `name:{ ... }` entries, visiting keys in sorted order (dictsort). -#}
+    {#- `required` is accepted for call-site symmetry but is never read inside this macro; required-lists are emitted by the callers and by the ARRAY/OBJECT branches below. -#}
+    {#- When `filter_keys` is true, entries whose key is one of `standard_keys` are skipped (used when an OBJECT schema is passed in directly as the property map). -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {#- ns.found_first: whether an entry has already been emitted, i.e. whether the next entry needs a leading comma. -#}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {#- add_comma: whether an attribute has already been emitted inside the current entry. -#}
+        {%- set add_comma = false -%}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {#- Type-specific attributes: `enum` for STRING, `items` for ARRAY. -#}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {#- Keys of the items schema whose value is none are omitted entirely. -#}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {#- `type` may be a single string or a list of strings; both are upper-cased. -#}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {#- Nested object: recurse into its `properties`; if that key is absent, treat the schema itself as the property map and filter out the standard keys. -#}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {#- The upper-cased `type` is always the last attribute of an entry and carries the entry's closing brace. -#}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}
+    {#- Renders one tool definition as `declaration:NAME{description:...,parameters:{...},response:{...}}`. -#}
+    {#- Reads `tool_data['function']` for `name`, `description`, and the optional `parameters` and `response` entries. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {#- NOTE(review): the closing brace of `parameters:{` is only emitted inside this branch, so a schema without a truthy `type` leaves the section unclosed; confirm this is intended. -#}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {#- NOTE(review): likewise, `response:{` is only closed when the response type upper-cases to OBJECT; confirm this is intended. -#}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {#- Recursively serializes a value: strings are wrapped in the `<|"|>` quote token, booleans become true/false, -#}
+    {#- mappings become `{key:value,...}` with keys in sorted order, sequences become `[item,...]`, anything else is printed as-is. -#}
+    {#- `escape_keys` controls whether mapping keys are also wrapped in `<|"|>`; it is passed down unchanged to nested values. -#}
+    {#- The string and mapping tests come before the sequence test because Jinja's `sequence` test also accepts strings and mappings. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {#- Numbers, none and any other type fall through to their default string form. -#}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}
+    {#- Removes every `<|channel>...<channel|>` span from `text` and returns the remainder, trimmed. -#}
+    {#- The text is cut at each closing marker; within each piece only the part before an opening marker is kept. -#}
+    {%- set acc = namespace(kept='') -%}
+    {%- for segment in text.split('<channel|>') -%}
+        {#- split(...)[0] yields the whole segment when it contains no opening marker, so no separate branch is needed. -#}
+        {%- set acc.kept = acc.kept + segment.split('<|channel>')[0] -%}
+    {%- endfor -%}
+    {{- acc.kept | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {#- Emits one tool result as `<|tool_response>response:NAME{...}<tool_response|>`. -#}
+    {#- A mapping response is rendered field by field in sorted key order; any other value is wrapped under a single `value` field. -#}
+    {%- if response is not mapping -%}
+        {{- '<|tool_response>' -}}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+        {{- '<tool_response|>' -}}
+    {%- else -%}
+        {{- '<|tool_response>' -}}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for field, payload in response | dictsort -%}
+            {%- if not loop.first %},{% endif -%}
+            {{- field -}}:{{- format_argument(payload, escape_keys=False) -}}
+        {%- endfor -%}
+        {{- '}<tool_response|>' -}}
+    {%- endif -%}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}
+{#- ns.prev_message_type records the kind of the most recently emitted element; it is read at the end of the template to decide whether a generation prompt is needed. -#}
+{#- loop_messages is the list rendered by the main loop; a leading system/developer message is consumed here and sliced off. -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block -#}
+{#- A system turn is emitted when thinking is enabled, tools are supplied, or the first message is a system/developer message. -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {#- Content may be a plain string or a list of parts; each part's `text` is trimmed and followed by a single space. -#}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {%- if tools -%}
+        {#- Each tool declaration is wrapped in its own `<|tool>` ... `<tool|>` pair inside the system turn. -#}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: record the index of the final user message (stays -1 when there is none); the main loop only renders reasoning for messages after it -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for candidate in loop_messages -%}
+    {%- if candidate['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = loop.index0 -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages -#}
+{#- role:tool messages are never rendered as their own turn; they are only emitted through the forward scan of the assistant message that issued the tool calls. -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {#- Only done for tool-calling messages that come after the last user message. -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {#- Arguments given as a mapping are serialized key by key in sorted order; a string is passed through verbatim. -#}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {#- ns_tr_out.flag: whether at least one tool response was emitted for this message. -#}
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {#- default(..., true) also replaces an explicit null name, which would otherwise break the string concatenation in the response block. -#}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown', true), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {#- Jinja has no break, so ns_tool_scan.stopped turns the remaining iterations into no-ops once a non-tool message is reached. -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name -#}
+                        {#- dict.get returns none (not undefined) for a missing key, so default needs its boolean flag for the 'unknown' fallback to apply. -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {#- Concatenate the text parts; a part whose text is missing or null contributes an empty string. -#}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {#- Capture the rendered content first so its emptiness can be tested when closing the turn below. -#}
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {#- Turn closing: a tool call with no response yet ends on an open <|tool_response>; a message that emitted responses but no content is left open; everything else is closed with <turn|>. -#}
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Open a fresh model turn for generation, unless the output already ends inside a model turn on a tool call or tool response. -#}
+{%- if add_generation_prompt and ns.prev_message_type not in ['tool_response', 'tool_call'] -%}
+    {{- '<|turn>model\n' -}}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-1400/optimizer.pt b/checkpoint-1400/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6e8f193f386581ef2e412324e5c1e4dfd8be8331
--- /dev/null
+++ b/checkpoint-1400/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4e89c6629846413da997174e8f2034125889039290a4be48aed11dbe5707be8
+size 72807355
diff --git a/checkpoint-1400/processor_config.json b/checkpoint-1400/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-1400/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-1400/rng_state.pth b/checkpoint-1400/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-1400/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-1400/scheduler.pt b/checkpoint-1400/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c3a541ea94708ae9b1d6e581389ef1d1fc95c392
--- /dev/null
+++ b/checkpoint-1400/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:857af1b441f656602733aca456e3a743a89384ac648ee3fd3489a940ab95b523
+size 1465
diff --git a/checkpoint-1400/tokenizer.json b/checkpoint-1400/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-1400/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-1400/tokenizer_config.json b/checkpoint-1400/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-1400/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-1400/trainer_state.json b/checkpoint-1400/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..4c6f04641ac09dbac246ba5e604d79b75a56b651
--- /dev/null
+++ b/checkpoint-1400/trainer_state.json
@@ -0,0 +1,2002 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.2547307132459971,
+  "eval_steps": 100,
+  "global_step": 1400,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.772071633538204e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-1400/training_args.bin b/checkpoint-1400/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-1400/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-1500/README.md b/checkpoint-1500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-1500/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-1500/adapter_config.json b/checkpoint-1500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-1500/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-1500/adapter_model.safetensors b/checkpoint-1500/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e28903f8dde41ae0f08b4ffd11e175d1d7bbd83c
--- /dev/null
+++ b/checkpoint-1500/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:69d60343b506ad0c943d74eadd22f650e82ab11cb475ba6ad3c72df2549e3b00
+size 169741912
diff --git a/checkpoint-1500/chat_template.jinja b/checkpoint-1500/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-1500/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders each entry of properties (dictsort order) as key:{...,type:<|"|>TYPE<|"|>}, comma-separated. The required argument is not read in this body; when filter_keys is true, keys listed in standard_keys are skipped. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- turns true once this entry has emitted a field, so every later field is prefixed with ',' -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- OBJECT without a 'properties' mapping: the entry's own non-standard keys are rendered as its properties (filter_keys=true) -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- type is always written last and its line also closes the entry's brace -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders tool_data['function'] as declaration:NAME{description:...[,parameters:{...}][,response:{...}]}. Reads the function's name, description, parameters and optional response. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>
+        {%- endif -%}{{- '}' -}}{#- always close parameters:{ so the braces stay balanced when 'type' is absent -#}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>
+        {%- endif -%}{{- '}' -}}{#- always close response:{ ; the type field itself is still written only for OBJECT -#}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {#- Serialise one value: strings are wrapped in <|"|>, booleans become true/false, mappings -#}
+    {#- become {k:v,...} in dictsort order, other sequences become [a,b,...], anything else is -#}
+    {#- printed as-is. escape_keys decides whether mapping keys are wrapped too (kept on recursion). -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {%- if argument -%}
+            {{- 'true' -}}
+        {%- else -%}
+            {{- 'false' -}}
+        {%- endif -%}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- for name, inner in argument | dictsort -%}
+            {{- ',' if not loop.first -}}
+            {{- ('<|"|>' + name + '<|"|>') if escape_keys else name -}}
+            :{{- format_argument(inner, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for element in argument -%}
+            {{- ',' if not loop.first -}}
+            {{- format_argument(element, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}
+    {#- Remove every '<|channel>...<channel|>' span from text and return what is left, -#}
+    {#- trimmed of leading and trailing whitespace. -#}
+    {%- set acc = namespace(kept='') -%}
+    {%- for segment in text.split('<channel|>') -%}
+        {#- split() returns the whole segment at index 0 when it holds no '<|channel>' opener -#}
+        {%- set head = segment.split('<|channel>')[0] -%}
+        {%- set acc.kept = acc.kept + head -%}
+    {%- endfor -%}
+    {{- acc.kept | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {#- Emit one '<|tool_response>response:NAME{...}<tool_response|>' block. A mapping is written -#}
+    {#- as key:value pairs in dictsort order; any other value is wrapped as a single 'value' entry. -#}
+    {{- '<|tool_response>' + 'response:' + tool_name + '{' -}}
+    {%- if response is mapping -%}
+        {%- for field, payload in response | dictsort -%}
+            {{- ',' if not loop.first -}}
+            {{- field -}}:{{- format_argument(payload, escape_keys=False) -}}
+        {%- endfor -%}
+    {%- else -%}
+        {{- 'value:' + format_argument(response, escape_keys=False) -}}
+    {%- endif -%}
+    {{- '}' + '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- records what was emitted last; read later to decide how a turn is closed and whether a generation prompt is opened -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block: emitted when thinking is enabled, tools are supplied, or the first message is a system/developer message -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the system/developer message is rendered here, so the main loop starts after it -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: remember the position of the final user message; the reasoning guard compares against it -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for candidate in loop_messages -%}
+    {%- if candidate['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = loop.index0 -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages: each non-tool message is rendered as a turn (thinking, tool calls, tool responses, then content) -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}{#- role:'tool' messages are not rendered on their own; the forward scan of the preceding tool-calling message consumes them -#}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel (only after the last user message and only on tool-calling messages) -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name; default(..., true) also replaces the None that .get() returns for a missing 'name' -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- tool calls were emitted but no response was rendered: end on an opening <|tool_response> marker instead of closing the turn -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Open a model turn for generation unless the output already ends on a tool call or tool response -#}
+{%- set resume_open_turn = ns.prev_message_type in ['tool_response', 'tool_call'] -%}
+{%- if add_generation_prompt and not resume_open_turn -%}
+    {{- '<|turn>model\n' -}}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-1500/optimizer.pt b/checkpoint-1500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c11ad1abb065b3e9c4ecd72e817201e14359910e
--- /dev/null
+++ b/checkpoint-1500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52868d6f8fcaa503d24389933d16a144ef1ca33b160fb8bb4f5a732b8c520ddf
+size 72807355
diff --git a/checkpoint-1500/processor_config.json b/checkpoint-1500/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-1500/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-1500/rng_state.pth b/checkpoint-1500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-1500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-1500/scheduler.pt b/checkpoint-1500/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e413bcf8f4605605003c34039a363e44b27c2e96
--- /dev/null
+++ b/checkpoint-1500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3af2dbb852cf80ef122cfd6295dff1ecfd87bd99ba018f26d2f07667a9aed01b
+size 1465
diff --git a/checkpoint-1500/tokenizer.json b/checkpoint-1500/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-1500/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-1500/tokenizer_config.json b/checkpoint-1500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-1500/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-1500/trainer_state.json b/checkpoint-1500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f25f35a90ae22eae5224df099467d13d296c40bc
--- /dev/null
+++ b/checkpoint-1500/trainer_state.json
@@ -0,0 +1,2142 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.27292576419213976,
+  "eval_steps": 100,
+  "global_step": 1500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.315576795670244e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-1500/training_args.bin b/checkpoint-1500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-1500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-1600/README.md b/checkpoint-1600/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-1600/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-1600/adapter_config.json b/checkpoint-1600/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-1600/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-1600/adapter_model.safetensors b/checkpoint-1600/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b02bce2ee44b6a015a92b9e4cf802d8f302974eb
--- /dev/null
+++ b/checkpoint-1600/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d76afc5ff79e9f9418fab256c9529de31a72b27cd40750f03f0fa62717eb9285
+size 169741912
diff --git a/checkpoint-1600/chat_template.jinja b/checkpoint-1600/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-1600/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders every entry of `properties`, sorted by key, as a comma-separated list of `key:{...}` items. The `required` argument is accepted but never read in this body. With filter_keys=true, keys listed in standard_keys are skipped. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}{#- schema keywords that are not treated as property names when filter_keys is true -#}
+    {%- set ns = namespace(found_first=false) -%}{#- found_first: an entry has already been written, so the next one gets a leading comma -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- add_comma: a field has already been written inside the braces of the current entry -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}{#- STRING: only the enum list, when present, is rendered -#}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}{#- ARRAY: the item schema is rendered field by field, sorted by key -#}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}{#- item fields whose value is none are omitted -#}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}{#- type names are upper-cased, whether given as one string or as a list -#}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}{#- OBJECT: nested properties are rendered by recursion, followed by the required list -#}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- no properties mapping: the non-standard keys of the entry itself are rendered as its properties (filter_keys=true) -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- the upper-cased type is always written last; the next line also closes the brace of this entry -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool definition from tool_data['function']: its name and description, then the optional parameters section and the optional response section. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the brace opened by the parameters section is closed only on the next line, i.e. only when params['type'] is truthy -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): the brace opened by the response section is likewise closed only inside this branch, i.e. only for type OBJECT -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serializes a value recursively: strings are wrapped in the escape token, booleans become true or false, mappings become a key-sorted brace list, other sequences become a bracket list, and anything else is printed as is. -#}
+    {%- if argument is string -%}{#- string and mapping are tested before sequence because the Jinja sequence test also matches those types -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}{#- keys are wrapped in the escape token only when escape_keys is true; otherwise they are printed bare -#}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Splits text on the channel-close token; within each piece everything from the first channel-open token onward is dropped; the remaining pieces are concatenated and the result is trimmed. -#}
+    {%- set ns = namespace(result='') -%}{#- namespace so the accumulated string survives across loop iterations -#}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Wraps one tool result between the tool-response open and close tokens as response:NAME followed by a brace list. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}{#- a mapping is rendered field by field, sorted by key, with bare (unescaped) keys -#}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}{#- any other value is rendered under the single key named value -#}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type records the kind of the last special element rendered; it is read after the message loop to decide whether a generation prompt is emitted -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- System turn: emitted when thinking is enabled, when tools are supplied, or when the first message has role system or developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}{#- content may be a plain string or a list of parts; for a list, the trimmed text of each part is written followed by one space -#}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the first message has been rendered here, so the main loop starts from the second one -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find the index of the last user message in loop_messages (stays -1 when there is none); the message loop renders reasoning only for messages after that index -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}{#- tool-role messages are not rendered on their own; the forward scan further down renders them together with the message that carries the tool_calls -#}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}{#- walk backwards; the found flag turns the remaining iterations into no-ops once the nearest non-tool message is seen -#}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as a thought channel, only for messages after the last user message that also carry tool_calls -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}{#- each tool call is rendered as call:NAME followed by its arguments in braces, between the tool-call tokens -#}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}{#- string arguments are inserted verbatim between the braces -#}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag: at least one tool response was rendered for this message -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}{#- stopped: set at the first non-tool message; later iterations then do nothing -#}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name; falls back to the name field of the tool message, then to 'unknown' -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}{#- content parts: the text of every part of type text is concatenated; other part types are ignored -#}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}{#- the message content is rendered into a variable first so that it can be tested for emptiness below; model text goes through strip_thinking, other roles are only trimmed -#}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- tool calls without any rendered response: leave a tool-response token open instead of closing the turn -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}{#- the turn is closed unless tool responses were rendered and the message has no content of its own -#}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}{#- no new model turn header is opened when the last rendered message ended on a tool call or a tool response -#}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-1600/optimizer.pt b/checkpoint-1600/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..64c17bc27169a6c1b414ce1b23f6c4ab528046c1
--- /dev/null
+++ b/checkpoint-1600/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:044e1ca71e64bd33573cf03130723b7c3498a1381191791bc0ac6a53d0f0169f
+size 72807355
diff --git a/checkpoint-1600/processor_config.json b/checkpoint-1600/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-1600/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-1600/rng_state.pth b/checkpoint-1600/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-1600/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-1600/scheduler.pt b/checkpoint-1600/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d8cf0a25e31300fb36150cbf31c849bdb3152739
--- /dev/null
+++ b/checkpoint-1600/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3407cdafc385e5f27b4133c663904e78499531174c286753c8d22a6075323095
+size 1465
diff --git a/checkpoint-1600/tokenizer.json b/checkpoint-1600/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-1600/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-1600/tokenizer_config.json b/checkpoint-1600/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-1600/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-1600/trainer_state.json b/checkpoint-1600/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..e05cf51f3434ca164e2fea211d7366031b688eca
--- /dev/null
+++ b/checkpoint-1600/trainer_state.json
@@ -0,0 +1,2282 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.29112081513828236,
+  "eval_steps": 100,
+  "global_step": 1600,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.860485929468659e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-1600/training_args.bin b/checkpoint-1600/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-1600/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-1700/README.md b/checkpoint-1700/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-1700/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for gemma-4-E4B-it LoRA adapter (checkpoint-1700)
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** unsloth/gemma-4-E4B-it
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-1700/adapter_config.json b/checkpoint-1700/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-1700/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-1700/adapter_model.safetensors b/checkpoint-1700/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4fabe07009ae70f5b0cb414813e8df5f42ed1a68
--- /dev/null
+++ b/checkpoint-1700/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1fc93d16691d0e3cdc725a1d2e6b92ff9081c08a109445bcc51a63ae59ccca77
+size 169741912
diff --git a/checkpoint-1700/chat_template.jinja b/checkpoint-1700/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-1700/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders every entry of `properties`, sorted by key, as a comma-separated list of `key:{...}` schema entries. With filter_keys=true, keys listed in standard_keys are skipped. NOTE(review): `required` is accepted but never read inside this macro body. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}{#- found_first: an entry was already written, so the next one gets a leading comma -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- add_comma: a field was already written inside the current entry -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}{#- enum is only rendered for STRING-typed values -#}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}{#- ARRAY: render the non-empty items mapping, recursing into nested properties -#}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}{#- OBJECT: render nested properties, then the required list -#}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- no `properties` mapping: recurse over the value itself, skipping the keys in standard_keys -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- type is always written last, then the entry is closed -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool as `declaration:NAME{description:...,parameters:{...},response:{...}}`, reading name, description, parameters and the optional response from tool_data['function']. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the brace that closes `parameters:` is written only in this branch; confirm params always carries a type -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): the brace that closes `response:` is written only when the type is OBJECT; confirm other types never occur -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Recursively serialises a value: strings are wrapped in the quote token, booleans become true/false, mappings become `{k:v,...}` sorted by key, other sequences become `[a,b,...]`, and anything else is printed as is. escape_keys controls whether mapping keys are also wrapped in the quote token. -#}
+    {%- if argument is string -%}{#- tested before `sequence`, so strings never reach the list branch below -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}{#- remaining values (for example numbers or none) are printed with their default string form -#}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Returns `text` with every span from a channel-open token up to the next channel-close token removed, then trimmed. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}{#- each part ends where a channel closed (or at the end of the text) -#}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}{#- keep only what precedes the first channel opener in this part -#}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Wraps one tool result in the tool_response open/close tokens as `response:NAME{...}`. tool_name is concatenated with string literals, so it must be a string. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}{#- mapping: one `key:value` pair per entry, sorted by key, keys left unquoted -#}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}{#- any other payload is placed under a single `value` key -#}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type: kind of the last special piece rendered; read later when closing a turn and when deciding on the generation prompt -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block: a system turn is emitted when thinking is enabled, when tools are supplied, or when the first message has role system or developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}{#- content-parts list: every item's text is trimmed and followed by one space -#}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the first message was rendered here, so the main loop starts after it -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard. last_user_idx stays -1 when loop_messages contains no user message; the message loop compares loop.index0 against it before rendering reasoning text. -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages. Messages with role tool are not rendered on their own: they are consumed by the forward scan of the preceding message that carries tool_calls. -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}{#- walk backwards; the found flag stands in for a break -#}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel, only for messages after the last user message that also carry tool_calls -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}{#- string arguments are emitted verbatim between the braces -#}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag: at least one tool response was rendered for this message -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}{#- no break in Jinja: once stopped, the remaining iterations do nothing -#}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name. .get() yields None (not undefined) for a missing key, so default() needs its boolean flag for the 'unknown' fallback to apply; without it a None name would later be concatenated with a string. -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array; only parts of type text are joined, and a missing or None text counts as empty -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}{#- content is rendered into a variable first so its emptiness can be tested below -#}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- tool calls with no rendered response: open a tool_response instead of closing the turn -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}{#- close the turn unless responses were rendered and no content followed them -#}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- a new model turn is opened unless the last rendered piece was a tool call or tool response; presumably the model turn is still open in those cases (verify against the turn-closing logic in the message loop) -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-1700/optimizer.pt b/checkpoint-1700/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b285e977f72411d65e135e9e447e5fe5e9c72182
--- /dev/null
+++ b/checkpoint-1700/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e035c7f0b6983e93185d1768718559dd5f3716bce8fa1ef79cff194aa53d17e8
+size 72807355
diff --git a/checkpoint-1700/processor_config.json b/checkpoint-1700/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-1700/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-1700/rng_state.pth b/checkpoint-1700/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-1700/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-1700/scheduler.pt b/checkpoint-1700/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b6434a76d4d87812a0731572c67f178ae6865ae7
--- /dev/null
+++ b/checkpoint-1700/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d443ebce6b9e1fa228706e611e162a7398618a90c3923fa065457abec2fe8fa0
+size 1465
diff --git a/checkpoint-1700/tokenizer.json b/checkpoint-1700/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-1700/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-1700/tokenizer_config.json b/checkpoint-1700/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-1700/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-1700/trainer_state.json b/checkpoint-1700/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1f22bbf02fd2298130e2bde7a2667499990b6d6f
--- /dev/null
+++ b/checkpoint-1700/trainer_state.json
@@ -0,0 +1,2422 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.309315866084425,
+  "eval_steps": 100,
+  "global_step": 1700,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 9.409120064297925e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-1700/training_args.bin b/checkpoint-1700/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-1700/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-1800/README.md b/checkpoint-1800/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-1800/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-1800/adapter_config.json b/checkpoint-1800/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-1800/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-1800/adapter_model.safetensors b/checkpoint-1800/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0fa05a14273bda432c5109b78eebe99a26ebb547
--- /dev/null
+++ b/checkpoint-1800/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37be58b37107e3e073fc6a7989de8b112a42eb3fe45ff7a948df543599fbb4d5
+size 169741912
diff --git a/checkpoint-1800/chat_template.jinja b/checkpoint-1800/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-1800/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders each entry of `properties` (visited in key order via dictsort) as `key:{...}`, comma-separated. The `required` argument is accepted but never read in this body. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}{#- Keys that are skipped when filter_keys is true. -#}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- add_comma becomes true once any field has been written for this key, so later fields get a leading comma. -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- The type field is always written last; its trailing brace closes the one opened at `key:{`. -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool as `declaration:NAME{description:...` followed by optional parameters and response sections, then a closing brace. Reads tool_data['function']. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the brace opened by `parameters:{` is closed only inside this branch. -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): the brace opened by `response:{` is closed only when the type is OBJECT. -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serialises one value: strings are wrapped in <|"|>, booleans print as true/false, mappings print as a brace-delimited key:value list in key order, sequences print as a bracketed list, anything else is printed as-is. escape_keys controls whether mapping keys are wrapped in <|"|>; it is passed down on every recursive call. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}{#- Reached only after the string and mapping tests above have failed. -#}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Splits `text` on '<channel|>'; for every piece that contains '<|channel>' only the portion before the first '<|channel>' is kept. The kept pieces are concatenated and the result is trimmed. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emits one tool result between the <|tool_response> and <tool_response|> markers as `response:` + tool_name + a brace-delimited body. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}{#- Mapping: one key:value pair per entry in key order, with keys left unquoted. -#}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}{#- Any other value is wrapped under a single `value` key. -#}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type is overwritten as elements are rendered; values assigned in this template include 'think', 'tool', 'tool_call' and 'tool_response'. -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- The first message was rendered in this system turn, so the main loop starts at the second message. -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}{#- Stays -1 when loop_messages contains no user message. -#}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag flips to true once at least one tool response has been rendered for this message -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native). default(..., true) also replaces a None or empty name, not only a missing key. -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown', true), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages; the scan stops at the first following message whose role is not 'tool' -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name. .get() yields None (not undefined) for a missing key, so default() needs its boolean flag for the 'unknown' fallback to apply. -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array; for an array only the 'text' parts are concatenated -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}{#- boolean flag: a text part whose 'text' is missing or None contributes '' instead of failing the concatenation -#}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}{#- Render the message content into a variable first so it can be both emitted and tested for emptiness afterwards -#}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}{#- Model-role text goes through strip_thinking (defined outside this view); every other role is only trimmed -#}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}{#- Content-parts list: text parts are rendered like string content, media parts become placeholder tokens; other part types are ignored -#}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}{#- Each media part overwrites ns.prev_message_type with its own kind -#}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}{#- True when the rendered content is non-empty after trimming whitespace -#}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- A tool call was the last thing recorded and no response was rendered: emit the tool_response opener instead of closing the turn -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}{#- Close the turn, except when tool responses were rendered and no content followed them; in that case nothing is emitted here -#}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Generation prompt: open a model turn unless the last thing rendered was a tool call or a tool response. -#}
+{%- set after_tool_step = ns.prev_message_type in ['tool_response', 'tool_call'] -%}
+{%- if add_generation_prompt and not after_tool_step -%}
+    {{- '<|turn>model\n' -}}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-1800/optimizer.pt b/checkpoint-1800/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3b5376b2aff9b0e5fedb49cb8d9ec251350c341c
--- /dev/null
+++ b/checkpoint-1800/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f2e48dd5170d692b732527d1d8d71c793dc18b8a4688a937ac30ddb8a190278
+size 72807355
diff --git a/checkpoint-1800/processor_config.json b/checkpoint-1800/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-1800/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-1800/rng_state.pth b/checkpoint-1800/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-1800/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-1800/scheduler.pt b/checkpoint-1800/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..1731e382022e615ee7d7c054b92df874a8e611dc
--- /dev/null
+++ b/checkpoint-1800/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5929f0ded68cadf5a903af70646ae01b592f23d0cf488dad2297141251892d69
+size 1465
diff --git a/checkpoint-1800/tokenizer.json b/checkpoint-1800/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-1800/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-1800/tokenizer_config.json b/checkpoint-1800/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-1800/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-1800/trainer_state.json b/checkpoint-1800/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1936a2517897358c32b61d00cbb13aaef1541746
--- /dev/null
+++ b/checkpoint-1800/trainer_state.json
@@ -0,0 +1,2562 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.32751091703056767,
+  "eval_steps": 100,
+  "global_step": 1800,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 9.961482475364106e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-1800/training_args.bin b/checkpoint-1800/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-1800/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-1900/README.md b/checkpoint-1900/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-1900/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-1900/adapter_config.json b/checkpoint-1900/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-1900/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-1900/adapter_model.safetensors b/checkpoint-1900/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b87aa3fe7fa719d6a7eec1f610f38bce67e0598b
--- /dev/null
+++ b/checkpoint-1900/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b14ba9786290b8547d2316a4207455e4bf529521963087b4c7b5834a1d5c685
+size 169741912
diff --git a/checkpoint-1900/chat_template.jinja b/checkpoint-1900/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-1900/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders every entry of `properties`, sorted by key, as a comma-separated list of key:{...} schema fragments. `required` is accepted but never read inside this macro. With filter_keys=true, entries whose key is in standard_keys are skipped. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}{#- schema keywords that are not treated as property names when filter_keys is true -#}
+    {%- set ns = namespace(found_first=false) -%}{#- namespace so the flag can be updated from inside the loop; becomes true once one entry has been written -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- true once a field has been written inside the current key:{...}, so later fields are prefixed with a comma -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}{#- STRING: only an optional enum list is added -#}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}{#- ARRAY: render the items schema, key by key, when it is a non-empty mapping -#}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}{#- entries whose value is none are omitted entirely -#}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}{#- OBJECT: recurse into nested properties, then list the required names -#}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- no usable 'properties' mapping: the value's own keys are rendered as properties, skipping the standard schema keywords -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- the upper-cased type is always the last field and closes the entry -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool as declaration:NAME{description:...} followed by an optional parameters:{...} section and an optional response:{...} section, all read from tool_data['function']. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the brace that closes the parameters section is written only inside this branch, so a schema without a truthy 'type' leaves it unclosed (and ends on a trailing comma) - confirm intended -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): the brace that closes the response section is written only when the type is OBJECT; any other type leaves it unclosed - confirm intended -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serialises one value recursively: strings are wrapped in the <|"|> quote token, booleans become true/false, mappings become {k:v,...} sorted by key, sequences become [a,b,...], and anything else is printed as-is. escape_keys controls whether mapping keys are also wrapped in the quote token. -#}
+    {%- if argument is string -%}{#- tested before the sequence branch below so that strings are quoted rather than iterated -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}{#- fallback for every remaining value (for example numbers): rendered with the default string conversion -#}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Returns `text` with every span from a <|channel> marker up to the next <channel|> marker (or to the end of the text when unclosed) removed, the <channel|> markers themselves dropped, and the result trimmed. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}{#- keep only what precedes the opening marker in this piece -#}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emits <|tool_response>response:NAME{...}<tool_response|>. A mapping response is rendered as key:value pairs sorted by key with unquoted keys; any other response is wrapped as {value:...}. tool_name is joined with the + operator, so it has to be a string. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- ns.prev_message_type records the kind of the last special element rendered; it is read at the end of the template to decide whether a generation prompt is added -#}
+{%- set loop_messages = messages -%}{#- messages rendered by the main loop; narrowed below when the first message is consumed by the system turn -#}
+{{- bos_token -}}
+{#- System turn: emitted when thinking is enabled, when tools are supplied, or when the first message has role system/developer; it holds the think token, that message's text and the tool declarations -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the leading system/developer message has been rendered here, so the main loop starts after it -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: record the index within loop_messages of the last message with role 'user' (-1 when there is none); the message loop renders reasoning only for messages located after that index -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages; role:tool messages are skipped here and are rendered by the forward scan of the assistant message that precedes them -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}{#- reset for every rendered message; the generation-prompt check after the loop reads the final value -#}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as a thought channel, only for tool-calling messages located after the last user message -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}{#- string arguments are inserted verbatim between the braces -#}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag becomes true once at least one tool response has been rendered for this message -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown', true), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}{#- no break statement is used: once stopped, the remaining iterations do nothing -#}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve the function name: start from the tool message's own name ('unknown' when missing, None or empty), then prefer the name of the tool call whose id equals tool_call_id -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}{#- boolean=true is required: get() yields None, not Undefined, and a None name would break the string concatenation in format_tool_response_block -#}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string and tool_body is not mapping -%}{#- mappings also pass the sequence test; they are excluded so that they reach the else branch and are rendered as structured responses -#}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}{#- boolean=true so a None text contributes an empty string instead of failing the concatenation -#}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- tool call without a rendered response: open a tool response instead of closing the turn; otherwise close the turn unless responses were rendered and the message has no content -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- Open a new model turn for generation unless the last rendered message ended on a tool call or a tool response -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}{#- NOTE(review): when tool responses were rendered and the same message also had text content, the loop has already closed the turn while this value is still 'tool_response', so no new turn is opened here - confirm intended -#}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-1900/optimizer.pt b/checkpoint-1900/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..edbf499a2d140d165acb1b01d5b97b20c8464d13
--- /dev/null
+++ b/checkpoint-1900/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2292d63b15aac6e7462fa0c28e7be8610fd06b587c5bd1ade3e75c7ee94893fe
+size 72807355
diff --git a/checkpoint-1900/processor_config.json b/checkpoint-1900/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-1900/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-1900/rng_state.pth b/checkpoint-1900/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-1900/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-1900/scheduler.pt b/checkpoint-1900/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3e10b8078fa966f09bfa866fbaf6f2f4ef4f5b64
--- /dev/null
+++ b/checkpoint-1900/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1eec1b8c04b4aafb9d7ce16df1301b79becce4e5c3d0741b708efa6172330081
+size 1465
diff --git a/checkpoint-1900/tokenizer.json b/checkpoint-1900/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-1900/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-1900/tokenizer_config.json b/checkpoint-1900/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-1900/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-1900/trainer_state.json b/checkpoint-1900/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d0f35e0eabd94d7afe9f044962fde323088a9d14
--- /dev/null
+++ b/checkpoint-1900/trainer_state.json
@@ -0,0 +1,2702 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.3457059679767103,
+  "eval_steps": 100,
+  "global_step": 1900,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.0510934319974177e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-1900/training_args.bin b/checkpoint-1900/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-1900/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-200/README.md b/checkpoint-200/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-200/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for the unsloth/gemma-4-E4B-it LoRA adapter (checkpoint-200)
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** unsloth/gemma-4-E4B-it
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-200/adapter_config.json b/checkpoint-200/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-200/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-200/adapter_model.safetensors b/checkpoint-200/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0dc859761af4bdca3b87b9a6f46266d791b01f4b
--- /dev/null
+++ b/checkpoint-200/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89f1fb5bfacc31fdb717a86263aba4bc317f49b0e0a622c411c81f51589ab46f
+size 169741912
diff --git a/checkpoint-200/chat_template.jinja b/checkpoint-200/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-200/chat_template.jinja
@@ -0,0 +1,351 @@
+{#-
+    format_parameters(properties, required, filter_keys=false)
+    Renders the entries of `properties` (visited in key order via dictsort) as
+    comma-separated `key:{...}` items. Inside each item the fields are written in
+    this order when present: description, enum (STRING types), items (ARRAY types),
+    nullable, properties and required (OBJECT types), and finally the upper-cased
+    type, which also closes the item. Description, required names and type are
+    wrapped in <|"|> markers.
+      properties  - mapping of parameter name to its schema mapping.
+      required    - passed by every caller but not referenced inside this macro body.
+      filter_keys - when true, keys listed in standard_keys are skipped.
+    Recurses into itself for nested object properties and array item properties.
+-#}
+{%- macro format_parameters(properties, required, filter_keys=false) -%}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}
+        {#- With filter_keys set, entries whose key is a schema keyword (standard_keys) are skipped. -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {#- Entry separator: a ',' precedes every rendered entry except the first one. -#}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {#- add_comma idiom used from here on: write ',' if a field was already emitted for this entry, otherwise only record that one now has been. -#}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {#- Keys of the items schema whose value is none are omitted entirely. -#}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {#- The item type may be a single string or a list of type names; both are upper-cased. -#}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}
+                    {#- No explicit 'properties' mapping: the entry's own keys are rendered as nested properties, with schema keywords filtered out (filter_keys=true). -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {#- 'type' is always written last; the same line emits the brace that closes this entry. -#}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{#-
+    format_function_declaration(tool_data)
+    Renders one tool definition as
+    `declaration:NAME{description:<|"|>DESC<|"|>[,parameters:...][,response:...]}`.
+      tool_data - mapping whose 'function' entry supplies 'name', 'description',
+                  'parameters' and, optionally, 'response'.
+    The parameters section lists properties (via format_parameters), the required
+    names and the upper-cased type, in that order, each only when truthy.
+-#}
+{%- macro format_function_declaration(tool_data) -%}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {#- NOTE(review): the closing brace of the parameters section is written only inside this branch, so parameters without a truthy 'type' stay unclosed - confirm this is intended. -#}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {#- NOTE(review): the response type and the brace closing the response section are emitted only when the type upper-cases to OBJECT - confirm other types are meant to leave it open. -#}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{#-
+    format_argument(argument, escape_keys=True)
+    Serialises a value recursively:
+      string   - wrapped in <|"|> markers (the content itself is not escaped)
+      boolean  - the literal true or false
+      mapping  - a brace-delimited list of key:value pairs in key order (dictsort),
+                 comma-separated; keys are wrapped in <|"|> markers only when
+                 escape_keys is true
+      sequence - a bracket-delimited, comma-separated list of serialised items
+      other    - rendered as-is
+    escape_keys is passed unchanged to every nested call.
+-#}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {#- Test order matters: strings and mappings also satisfy the 'sequence' test, so they are handled before it. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{#-
+    strip_thinking(text)
+    Returns `text` without its thinking-channel spans, trimmed of surrounding
+    whitespace. The text is split on the closing marker '<channel|>'; within each
+    piece, everything from the opening marker '<|channel>' onwards is discarded
+    and the remainders are concatenated.
+-#}
+{%- macro strip_thinking(text) -%}
+    {#- A namespace is used as the accumulator so that assignments made inside the loop remain visible after it. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{#-
+    format_tool_response_block(tool_name, response)
+    Renders one tool result between the '<|tool_response>' and '<tool_response|>'
+    markers as `response:TOOL_NAME` followed by a brace-delimited body.
+      tool_name - name written after 'response:'.
+      response  - a mapping is rendered as key:value pairs in key order (dictsort);
+                  any other value is rendered as a single 'value' entry.
+    Values go through format_argument with escape_keys=False, so keys are written bare.
+-#}
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {#- Non-mapping results (for example a plain string) are wrapped under the fixed key 'value'. -#}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{#- Template-level state: ns.prev_message_type records the kind of element rendered most recently; loop_messages is the list the message loop iterates and defaults to every message. -#}
+{%- set ns = namespace(prev_message_type=None) -%}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block: a system turn is emitted when thinking is enabled, when tools are supplied, or when the first message has the system or developer role. -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {#- Content is either a plain string (trimmed) or a list of parts; each part's text is trimmed and followed by a single space. -#}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {#- The leading system/developer message has been rendered here, so it is excluded from the message loop. -#}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {#- Each tool definition is rendered between the '<|tool>' and '<tool|>' markers, inside the system turn. -#}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {#- Close the system turn. -#}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: record the index, within loop_messages, of the last message whose role is 'user'; it stays -1 when there is none. The message loop uses it as a guard and renders reasoning text only for messages positioned after that index. -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {#- The result lives on a namespace so that the assignment made inside the loop is still visible after it. -#}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-200/optimizer.pt b/checkpoint-200/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b389c40b520cb3c7eb73d38a7cc1a3480f842e8e
--- /dev/null
+++ b/checkpoint-200/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4b91829c7d08e4d7ec6bef21d1366610edf9b4f5e9f4bf99e1af4fc24452e78
+size 72806843
diff --git a/checkpoint-200/processor_config.json b/checkpoint-200/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-200/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-200/rng_state.pth b/checkpoint-200/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-200/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..66e6004136ff395821fc96a23576cb2c57bb9aa8
--- /dev/null
+++ b/checkpoint-200/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29729914fcd289b6cd7b8d01f022641caf784664fb47ca2f2f1100dd2c24307d
+size 1465
diff --git a/checkpoint-200/tokenizer.json b/checkpoint-200/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-200/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-200/tokenizer_config.json b/checkpoint-200/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-200/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a042aff091ff3ba7fcfe0dbd36c3e04b12b96aae
--- /dev/null
+++ b/checkpoint-200/trainer_state.json
@@ -0,0 +1,322 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.036390101892285295,
+  "eval_steps": 100,
+  "global_step": 200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.1939621783048986e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-200/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-2000/README.md b/checkpoint-2000/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-2000/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-2000/adapter_config.json b/checkpoint-2000/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-2000/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-2000/adapter_model.safetensors b/checkpoint-2000/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c97597f7537f1c06b48b6d5221ec8a500bec04fb
--- /dev/null
+++ b/checkpoint-2000/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e5237ec64de13e87d4dce23b236cab89d80e71d67d7620797822a866a7babc9
+size 169741912
diff --git a/checkpoint-2000/chat_template.jinja b/checkpoint-2000/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-2000/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders a mapping of property name to schema as comma-separated name:{...} entries in key-sorted order. The required argument is accepted but never read inside this macro. With filter_keys=true, keys listed in standard_keys are skipped. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}{#- found_first: an entry was already emitted, so the next entry gets a leading comma -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- add_comma: a field was already written inside the current entry, so the next field gets a leading comma -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}{#- STRING properties: emit the enum list when one is given -#}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}{#- ARRAY properties: emit the items schema, one key at a time, skipping keys whose value is none -#}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}{#- nested properties are rendered by recursing into this macro -#}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}{#- type may be a single string or a list of strings; both are upper-cased -#}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}{#- OBJECT properties: emit nested properties, then the required list -#}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- no properties mapping: recurse on the value itself, skipping the standard schema keys -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}{#- every entry ends with its upper-cased type and the closing brace -#}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool definition as declaration:NAME{description:...,parameters:{...},response:{...}} using the fields under tool_data['function']. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the closing brace of parameters is written only inside this branch, so a schema without a type leaves it unclosed; confirm this is intended before changing -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): the closing brace of response is written only when the type is OBJECT; confirm this is intended -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Recursively serializes a value: strings are wrapped in the escape token, booleans become true/false, mappings become {k:v,...} in key-sorted order, sequences become [a,b,...], anything else is printed as-is. escape_keys controls whether mapping keys are also wrapped in the escape token. -#}
+    {%- if argument is string -%}{#- the string test comes before the sequence test below, so strings are never rendered as lists -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}{#- remaining values (for example numbers or none) are printed with their default string form -#}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Returns text with each span from a channel-open marker up to the following channel-close marker removed, then trimmed. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}{#- split on the close marker; in each piece keep only what precedes an open marker -#}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Renders one tool result between the tool_response open and close markers as response:NAME{...}. tool_name is concatenated with string literals, so it must be a string. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}{#- mapping: one key:value pair per entry, key-sorted, keys not escaped -#}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}{#- any other response is wrapped under a single value key -#}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type records the kind of the last special segment emitted; it is read when closing a turn and when adding the generation prompt -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- System turn: emitted when thinking is enabled, tools are supplied, or the first message has role system or developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}{#- system content may be a plain string or a list of parts; each part's text is trimmed and followed by one space -#}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the first message is consumed here, so the main loop starts after it -#}
+    {%- endif -%}
+    {%- if tools -%}{#- each tool declaration is wrapped in the tool open and close markers -#}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: record the index (within loop_messages) of the last user message; it stays -1 when there is none. The main loop renders reasoning only for messages after this index. -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Main loop: render every non-tool message as a turn; role:tool messages are emitted from the assistant message that precedes them -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}{#- tool messages are never rendered on their own; the forward scan below picks them up -#}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel, only for messages after the last user message that also carry tool_calls -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}{#- each call is rendered as call:NAME{args}; mapping arguments are key-sorted, string arguments are inserted verbatim -#}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag becomes true once at least one tool response was rendered for this message -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name. default(..., true) also replaces None, which .get() returns for a missing name key; a None name would break the string concatenation in format_tool_response_block -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}{#- content parts: concatenate the text of every part whose type is text -#}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}{#- content is captured first so the closing logic below can test whether it is empty -#}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- tool calls with no rendered response: end on an opening tool_response marker instead of closing the turn -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}{#- close the turn unless tool responses were rendered with no content after them -#}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- open a new model turn unless the last segment emitted was a tool call or a tool response -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-2000/optimizer.pt b/checkpoint-2000/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ca8c73b498ae9dc0746f2a44ff42293c83ac5830
--- /dev/null
+++ b/checkpoint-2000/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1fa24a3e158860666ac3f2fbedcfa62984113fb8045909bf0a62c4d972f2137b
+size 72807355
diff --git a/checkpoint-2000/processor_config.json b/checkpoint-2000/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-2000/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-2000/rng_state.pth b/checkpoint-2000/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-2000/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-2000/scheduler.pt b/checkpoint-2000/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..731d6355192a1d6d2f4f05e83f27d99381e7ff35
--- /dev/null
+++ b/checkpoint-2000/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f5bdc1a9515599586f406cb9dd374c8e2782d4f3f12557a0e6fc81a835534f4
+size 1465
diff --git a/checkpoint-2000/tokenizer.json b/checkpoint-2000/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-2000/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-2000/tokenizer_config.json b/checkpoint-2000/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-2000/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-2000/trainer_state.json b/checkpoint-2000/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f823793b437ad32b488441ca957816107dee2ed
--- /dev/null
+++ b/checkpoint-2000/trainer_state.json
@@ -0,0 +1,2842 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.363901018922853,
+  "eval_steps": 100,
+  "global_step": 2000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.1054573765554867e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-2000/training_args.bin b/checkpoint-2000/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-2000/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-2100/README.md b/checkpoint-2100/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-2100/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-2100/adapter_config.json b/checkpoint-2100/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-2100/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-2100/adapter_model.safetensors b/checkpoint-2100/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..89824870012385a7445185a0b242b10503d88ec5
--- /dev/null
+++ b/checkpoint-2100/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd22d1447908436667437e7214461acb5b3fecd1084ef74d79e006b5bb7bb6fa
+size 169741912
diff --git a/checkpoint-2100/chat_template.jinja b/checkpoint-2100/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-2100/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders every entry of `properties` (in dictsort order) as a comma-separated `key:{...}` schema fragment. `required` is accepted but not referenced in this macro body. With filter_keys=true, keys listed in standard_keys are skipped. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}{#- found_first is carried across loop iterations so a ',' is written between entries but not before the first one -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- add_comma records whether a field has already been written inside this entry -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}{#- enum is only rendered for STRING-typed entries -#}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- separator idiom used throughout: write ',' if a field precedes this one, otherwise just mark that one now does -#}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}{#- ARRAY entries render their 'items' schema when it is a non-empty mapping -#}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}{#- item entries whose value is none are omitted entirely -#}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}{#- nested item properties are rendered by recursing into this macro -#}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}{#- a single type name is upper-cased directly; otherwise each element is upper-cased and the list is rendered -#}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}{#- OBJECT entries render nested properties and, when present, their required list -#}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- no usable 'properties' mapping: recurse over the entry itself with filter_keys=true so only its non-standard keys are rendered as properties -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- the upper-cased type is always the last field and its line also closes the brace opened after the key -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool definition as `declaration:NAME` followed by a braced body holding the description and, when present, the parameters schema and the response schema. Reads tool_data['function'] only. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the closing brace of the parameters object is emitted only inside this branch, so a parameters mapping without a truthy 'type' leaves that object unclosed; confirm callers always supply 'type' -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): the type field and the closing brace of the response object are emitted only when the type upper-cases to OBJECT; any other type leaves the response object unclosed; confirm whether non-OBJECT responses can occur -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serialises a value recursively: strings are wrapped in <|"|> markers, booleans become true/false, mappings become a braced key:value list in dictsort order, other sequences become a bracketed list, and anything else is emitted unchanged. escape_keys controls whether mapping keys are also wrapped in <|"|>, and is passed down to nested values. -#}
+    {%- if argument is string -%}{#- NOTE(review): strings and mappings are tested before `sequence`; Jinja's sequence test presumably also accepts them, so the branch order matters; confirm before reordering -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}{#- found_first suppresses the ',' before the first key/value pair -#}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}{#- fallback for values matching none of the tests above: emitted via plain interpolation -#}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Removes channel-delimited spans from `text`: the text is split on the closing marker '<channel|>', and each piece is cut at its first opening marker '<|channel>'. The kept pieces are concatenated and the result is trimmed. -#}
+    {%- set ns = namespace(result='') -%}{#- namespace so the accumulated string survives across loop iterations -#}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}{#- keep only what precedes the opening marker; the rest of the piece is dropped -#}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Renders one tool result between the '<|tool_response>' and '<tool_response|>' markers as `response:TOOL_NAME` followed by a braced body. A mapping response is written as key:value pairs in dictsort order; any other response is wrapped under a single `value` key. Values go through format_argument with escape_keys=False. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}{#- non-mapping response: emitted as a single `value` entry -#}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- kind of the most recently rendered element; read again when closing turns and before the generation prompt -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- System turn: emitted when thinking is enabled, tools are supplied, or the first message is a system/developer message. The 'messages and' guards keep an empty message list from raising on messages[0]. -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or (messages and messages[0]['role'] in ['system', 'developer']) -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages and messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}{#- list of parts: each part's trimmed text followed by one space -#}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the system message is consumed here, so the message loop starts after it -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: remember the position of the final 'user' message in loop_messages (stays -1 when there is none); the message loop compares against it before rendering reasoning. -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for candidate in loop_messages -%}
+    {%- if candidate['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = loop.index0 -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages: every non-tool message is rendered as a turn; role:tool messages are never rendered on their own, only via the forward scan of an assistant message that carries tool_calls -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel (only after the last user message and only on messages that carry tool_calls) -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name; .get() yields None (not undefined) for a missing 'name', so default needs its boolean flag to apply the 'unknown' fallback -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array; mappings also pass the 'sequence' test, so they are excluded here and fall through to the generic branch -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not mapping -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+        {#- Turn ending: a tool call with no rendered response ends on an opening <|tool_response>; a turn that rendered tool responses but no text gets no <turn|>; every other turn is closed with <turn|> -#}
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Generation prompt: when requested, open a new model turn -#}
+{#- unless the last rendered element was a tool call or a tool response. -#}
+{%- if add_generation_prompt and ns.prev_message_type not in ['tool_response', 'tool_call'] -%}
+    {{- '<|turn>model\n' -}}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-2100/optimizer.pt b/checkpoint-2100/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4bfde60777d9c71277f32f6310cda5d2ea5900ed
--- /dev/null
+++ b/checkpoint-2100/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1092c81c46128e4e68ee858b1452a206901141ef5ee9e216e16ac1ba432eb3fa
+size 72807355
diff --git a/checkpoint-2100/processor_config.json b/checkpoint-2100/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-2100/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-2100/rng_state.pth b/checkpoint-2100/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-2100/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-2100/scheduler.pt b/checkpoint-2100/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..95f472f2e9300a031d1dd6de2fef1d5198b78098
--- /dev/null
+++ b/checkpoint-2100/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d9bf6b09354aeeffdfb2ee007f3a492486af8a031630a2b864b38b951b063c1
+size 1465
diff --git a/checkpoint-2100/tokenizer.json b/checkpoint-2100/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-2100/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-2100/tokenizer_config.json b/checkpoint-2100/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-2100/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-2100/trainer_state.json b/checkpoint-2100/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..b146e2b8814610fd03f0683a40290ec83d81f0c6
--- /dev/null
+++ b/checkpoint-2100/trainer_state.json
@@ -0,0 +1,2982 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.38209606986899564,
+  "eval_steps": 100,
+  "global_step": 2100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.1604275617561206e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-2100/training_args.bin b/checkpoint-2100/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-2100/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-2200/README.md b/checkpoint-2200/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-2200/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** unsloth/gemma-4-E4B-it
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-2200/adapter_config.json b/checkpoint-2200/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-2200/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-2200/adapter_model.safetensors b/checkpoint-2200/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2bc9977e54d6c2cffd17bbb6cb4b29a89bc532ed
--- /dev/null
+++ b/checkpoint-2200/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e09661f96fafc76014b129e07d70d157fea9859b9d63a2fa6e24ff566fb9fead
+size 169741912
diff --git a/checkpoint-2200/chat_template.jinja b/checkpoint-2200/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-2200/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders each entry of `properties`, sorted by key, as a comma-separated `key:{...}` item. NOTE(review): the `required` argument is accepted but never referenced in this macro body. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}{#- Schema keywords that are skipped as property names when filter_keys is true. -#}
+    {%- set ns = namespace(found_first=false) -%}{#- Tracks whether an entry was already written, so commas go between entries only. -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- Becomes true once a field has been written inside the current entry. -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}{#- STRING properties may additionally carry an enum list. -#}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}{#- ARRAY properties render their non-empty `items` mapping, key by key in sorted order. -#}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}{#- OBJECT properties recurse into this macro for their nested properties. -#}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- No `properties` mapping: recurse over the value itself with filter_keys=true so standard_keys are skipped. -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- The upper-cased `type` field is always written last and closes the entry. -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool as `declaration:NAME{description:...,parameters:{...},response:{...}}`, reading name, description, parameters and optional response from tool_data['function']. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the closing brace of the parameters object is written only in this branch, so a parameters mapping without `type` leaves it unclosed; confirm this is intended before changing. -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): likewise, the response object is closed only when its type is OBJECT. -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serialises a value recursively: strings are wrapped in the <|"|> quote token, booleans become true/false, mappings become sorted `key:value` pairs in braces, other sequences become bracketed lists, and anything else is printed as-is. -#}
+    {%- if argument is string -%}{#- Tested before the sequence branch so strings are quoted rather than iterated. -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}{#- escape_keys decides whether mapping keys are wrapped in the quote token or written bare; the choice is passed down to nested values. -#}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Returns `text` with thinking-channel spans removed: the text is split on the closing marker <channel|>, and in every piece that contains the opening marker <|channel> only the part before that marker is kept. -#}
+    {%- set ns = namespace(result='') -%}{#- A namespace is used so the accumulated result survives the loop scope. -#}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Writes one tool result as <|tool_response>response:NAME{...}<tool_response|>. tool_name is concatenated with string literals, so it must be a string. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}{#- Mapping responses are written as sorted key:value pairs with bare (unquoted) keys. -#}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}{#- Any other response is wrapped under a single `value` key. -#}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type records what was rendered last; it is read later when closing a turn and when deciding whether to add the generation prompt. -#}
+{%- set loop_messages = messages -%}{#- Messages left for the main loop; a leading system/developer message is dropped from it below. -#}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block: a system turn is emitted when thinking is enabled, when tools are supplied, or when the first message has role system/developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}{#- The system/developer content is written here, either as a trimmed string or as the trimmed text of each content part followed by a space. -#}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {%- if tools -%}{#- Each tool declaration is wrapped in <|tool> ... <tool|> inside the system turn. -#}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard. The index is relative to loop_messages and stays -1 when there is no user message; the main loop only renders reasoning for messages positioned after it. -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages: every non-tool message is rendered as a turn; role:tool messages are not rendered on their own but are consumed by the forward scan of the assistant message that issued the tool calls -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel, only for tool-calling messages positioned after the last user message -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag becomes true once at least one tool response was rendered for this message. -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name. dict.get returns None (a defined value) for a missing name, so the boolean form of default is required for the 'unknown' fallback to apply; otherwise None would reach a string concatenation in format_tool_response_block -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}{#- A text part without a `text` value contributes an empty string instead of None. -#}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- Tool calls with no rendered response: leave the turn open with a tool_response marker. -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}{#- Otherwise close the turn, except when tool responses were rendered and no content followed them. -#}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- Opens a new model turn for generation, unless prev_message_type is 'tool_response' or 'tool_call'. -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-2200/optimizer.pt b/checkpoint-2200/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..1f333f1be8fdf2569d73e2cf85875c11a8cff165
--- /dev/null
+++ b/checkpoint-2200/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5008d914373b4eedcadd9daa42d0af31703fd4ddb9c587dbfe773f8990b0a7ec
+size 72807355
diff --git a/checkpoint-2200/processor_config.json b/checkpoint-2200/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-2200/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-2200/rng_state.pth b/checkpoint-2200/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-2200/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-2200/scheduler.pt b/checkpoint-2200/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..cc875bf98e117b4166aa25022db47f1003196ac7
--- /dev/null
+++ b/checkpoint-2200/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d7606597a533127bd5f869473a397899878377d8d2e89fd35dcd20b36560b18
+size 1465
diff --git a/checkpoint-2200/tokenizer.json b/checkpoint-2200/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-2200/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-2200/tokenizer_config.json b/checkpoint-2200/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-2200/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-2200/trainer_state.json b/checkpoint-2200/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..9acce07e33434b7356e60af62f0035bb79c99c49
--- /dev/null
+++ b/checkpoint-2200/trainer_state.json
@@ -0,0 +1,3122 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.4002911208151383,
+  "eval_steps": 100,
+  "global_step": 2200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.215214662937741e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-2200/training_args.bin b/checkpoint-2200/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-2200/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-2300/README.md b/checkpoint-2300/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-2300/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** unsloth/gemma-4-E4B-it
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-2300/adapter_config.json b/checkpoint-2300/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-2300/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-2300/adapter_model.safetensors b/checkpoint-2300/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9cf89c45643ab80f9c5df5c58b97003229ccccb7
--- /dev/null
+++ b/checkpoint-2300/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e14803de733438d76bcf1a7df87de4d28f4de3fdd96c32248bc74a90cf182a62
+size 169741912
diff --git a/checkpoint-2300/chat_template.jinja b/checkpoint-2300/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-2300/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders every entry of `properties` (sorted by key) as `key:{...}`, comma-separated. Each entry may carry description, enum (STRING), items (ARRAY), nullable, nested properties/required (OBJECT) and always ends with its upper-cased type. With filter_keys=true, keys listed in standard_keys are skipped. The `required` argument is accepted but never read inside this macro. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- add_comma: true once any field has been written inside the current entry -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- OBJECT without a 'properties' mapping: recurse over value itself, skipping the standard_keys -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- type is written last and its line also closes the entry's brace -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool definition as `declaration:NAME{description:...,parameters:{...},response:{...}}`, reading name, description, parameters and the optional response from tool_data['function']. NOTE(review): the closing brace of `parameters` is emitted only inside the params['type'] branch, and that of `response` only when its type is OBJECT; confirm every schema passed in carries those fields. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serialises one value recursively: strings are wrapped in the <|"|> quote token, booleans become true/false, mappings become {k:v,...} sorted by key (keys are quoted only when escape_keys is true), other sequences become [a,b,...], and anything else is emitted as-is. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}{#- strings were matched by the first branch, so this only sees list-like values -#}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Removes thinking-channel spans from text: splits on '<channel|>' and, for every piece containing '<|channel>', keeps only what precedes that marker; the concatenated result is trimmed. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emits <|tool_response>response:NAME{...}<tool_response|>. A mapping response is rendered as key:value pairs sorted by key with unquoted keys; any other response is wrapped as {value:...}. tool_name is concatenated with string literals, so it must be a string. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- ns.prev_message_type records the kind of the last emitted segment; it selects the turn terminator and the generation prompt further down -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name. .get() yields None (not undefined) for a missing 'name', so default() needs boolean=true to fall back to 'unknown'; otherwise None reaches a string concatenation -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}{#- concatenates the text parts; a missing or None 'text' counts as '' thanks to default('', true) -#}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-2300/optimizer.pt b/checkpoint-2300/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..029f122f22dcc59a9bcd5b9675be3fed281d17d1
--- /dev/null
+++ b/checkpoint-2300/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b81919ff41dce57dd507f064720073b68b2521c73bf4075c5bf87bc504864950
+size 72807355
diff --git a/checkpoint-2300/processor_config.json b/checkpoint-2300/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-2300/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-2300/rng_state.pth b/checkpoint-2300/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-2300/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-2300/scheduler.pt b/checkpoint-2300/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2a725a52fc0df173be64487a37eea58eb300f6cd
--- /dev/null
+++ b/checkpoint-2300/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5420b6c93e7edddec2910b70fdf481953abc7fb5f197a0f487e43331d572974b
+size 1465
diff --git a/checkpoint-2300/tokenizer.json b/checkpoint-2300/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-2300/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-2300/tokenizer_config.json b/checkpoint-2300/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-2300/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-2300/trainer_state.json b/checkpoint-2300/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..0d82063abdb1fc65a5c80de19cc5bcb352a46a65
--- /dev/null
+++ b/checkpoint-2300/trainer_state.json
@@ -0,0 +1,3262 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.41848617176128095,
+  "eval_steps": 100,
+  "global_step": 2300,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.2705431556818465e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-2300/training_args.bin b/checkpoint-2300/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-2300/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-2400/README.md b/checkpoint-2400/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-2400/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** unsloth/gemma-4-E4B-it
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-2400/adapter_config.json b/checkpoint-2400/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-2400/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-2400/adapter_model.safetensors b/checkpoint-2400/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ed54a7e89bf39570e891469479025873aef777c8
--- /dev/null
+++ b/checkpoint-2400/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34f96387bf15e2e5e186883b276e153549328fab909434291857087619cbd064
+size 169741912
diff --git a/checkpoint-2400/chat_template.jinja b/checkpoint-2400/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-2400/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}
+    {#- Renders every entry of `properties` (iterated in dictsort order) as a comma-separated `key:{...}` item. -#}
+    {#- When `filter_keys` is true, entries whose key appears in `standard_keys` are skipped. -#}
+    {#- NOTE(review): the `required` argument is not read anywhere in this macro body; confirm that this is intentional. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {#- add_comma records whether a field has already been written inside the current entry. -#}
+        {%- set add_comma = false -%}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {#- Type-specific fields: `enum` for STRING values, `items` for ARRAY values. -#}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {#- OBJECT values recurse: into value['properties'] when it is a mapping, otherwise into the value itself with filter_keys=true. -#}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {#- The upper-cased type is always written last; the same line also closes the entry's brace. -#}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}
+    {#- Renders one tool definition as `declaration:NAME{description:...,parameters:{...},response:{...}}`. -#}
+    {#- Name, description, parameters and the optional response are all read from tool_data['function']. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {#- NOTE(review): the brace that closes `parameters:{` is only written inside this branch, i.e. when params['type'] is truthy. -#}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {#- NOTE(review): the brace that closes `response:{` is only written when the response type upper-cases to OBJECT. -#}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {#- Recursively serializes a value. The branches are tested in this order: string, boolean, mapping, sequence, anything else. -#}
+    {#- Strings are wrapped in the <|"|> marker; booleans become true/false; mappings are written in dictsort order. -#}
+    {#- `escape_keys` controls whether mapping keys are wrapped in the <|"|> marker too; it is passed down to nested values. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {#- Fallback: any other value is printed through Jinja's default rendering. -#}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}
+    {#- Splits `text` on the '<channel|>' marker and, for each piece, keeps only what precedes the first '<|channel>' marker. -#}
+    {#- The kept pieces are concatenated and the result is trimmed before being emitted. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {#- Emits one tool result between the '<|tool_response>' and '<tool_response|>' markers, prefixed with `response:` and the tool name. -#}
+    {#- A mapping response is written as key:value pairs in dictsort order; any other response is wrapped as a single `value:` field. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{#- ns.prev_message_type records the kind of the most recently rendered element; it is read at the end of the template to decide whether a generation prompt is emitted. -#}
+{#- loop_messages starts as the full message list and is reassigned below to drop a leading system/developer message. -#}
+{%- set ns = namespace(prev_message_type=None) -%}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {#- String content is emitted trimmed; list content emits each item's trimmed text followed by one space. -#}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {#- Each tool declaration is wrapped in its own '<|tool>' ... '<tool|>' pair inside the system turn. -#}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard -#}
+{#- last_user_idx stays at -1 when loop_messages contains no message with role 'user'. -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {#- role:tool messages are not rendered on their own here; the forward scan further down emits them as part of the preceding message that carries tool_calls. -#}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {#- Walk backwards from the previous index and stop recording at the first non-tool message. -#}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {#- The thinking channel is only written for messages after the last user message that also carry tool_calls. -#}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {#- Serialize each tool call as `call:NAME` followed by its brace-wrapped arguments; string arguments are inserted verbatim. -#}
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {#- ns_tr_out.flag becomes true once at least one tool response has been rendered for this message. -#}
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {#- The scan stops at the first following message whose role is not 'tool'; later iterations render nothing. -#}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {#- Content-parts array: concatenate the text of every part whose type is 'text'. -#}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {#- Capture the rendered content so it can be both emitted and tested for emptiness below. -#}
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {#- Turn terminator: after tool calls with no rendered response, emit the tool_response opening marker instead of closing the turn. -#}
+        {#- Otherwise close the turn, except when tool responses were rendered and no content followed them. -#}
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Open a new model turn for generation, unless the last rendered element was a tool call or a tool response. -#}
+{%- if add_generation_prompt -%}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-2400/optimizer.pt b/checkpoint-2400/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b0eaf1a4d12f89e361c04e94ae83a3da8f1934b2
--- /dev/null
+++ b/checkpoint-2400/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb749e1e610c077e8f9950c8973644a4698f1b479184c93919d17ee94df26ce2
+size 72807355
diff --git a/checkpoint-2400/processor_config.json b/checkpoint-2400/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-2400/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-2400/rng_state.pth b/checkpoint-2400/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-2400/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-2400/scheduler.pt b/checkpoint-2400/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..12bc88601ab56223224a6763ab10f53be2457274
--- /dev/null
+++ b/checkpoint-2400/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eba492d1342c7608d70619516bfa6a83de7dd0de8ddd7c8fc9b80f417698ce96
+size 1465
diff --git a/checkpoint-2400/tokenizer.json b/checkpoint-2400/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-2400/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-2400/tokenizer_config.json b/checkpoint-2400/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-2400/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-2400/trainer_state.json b/checkpoint-2400/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..0faf26591fae2e72213066fe77c2a04a380bd1cb
--- /dev/null
+++ b/checkpoint-2400/trainer_state.json
@@ -0,0 +1,3402 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.4366812227074236,
+  "eval_steps": 100,
+  "global_step": 2400,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.325162457139422e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-2400/training_args.bin b/checkpoint-2400/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-2400/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-2500/README.md b/checkpoint-2500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-2500/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** unsloth/gemma-4-E4B-it
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-2500/adapter_config.json b/checkpoint-2500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-2500/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-2500/adapter_model.safetensors b/checkpoint-2500/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..39cf0f5764581e39fa89250a439c5a4909fa0a3c
--- /dev/null
+++ b/checkpoint-2500/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3eef1d2c60b4e9acabb5b2f6950ae95b7f548fc3090dad4828f252e4d574a14
+size 169741912
diff --git a/checkpoint-2500/chat_template.jinja b/checkpoint-2500/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-2500/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Render each entry of properties, in dictsort order, as a comma-separated key:{...} schema fragment. When filter_keys is true, keys listed in standard_keys are skipped. The required argument is accepted but is not read anywhere inside this macro. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- add_comma records whether a field has already been written for this key, so that the next field knows to write a leading comma -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- the upper-cased type is always written last, followed by the brace that closes this key's entry -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Render one tool definition as declaration:NAME followed by a braced body holding the description, then optional parameters and an optional response section. All fields are read from tool_data['function']. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the brace that closes the parameters section is only written inside this branch, so the output looks unbalanced when params has no type. Confirm that callers always supply it. -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): the brace that closes the response section is only written when the response type is OBJECT; any other type appears to leave it open. Confirm this is intended. -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serialise one value. Strings are wrapped in <|"|> delimiters, booleans become true or false, mappings become a braced key:value list in dictsort order (keys are wrapped as well when escape_keys is true), other sequences become a bracketed comma-separated list, and anything else is printed unchanged. Nested values are handled recursively with the same escape_keys setting. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Remove channel spans from text: the text is split on '<channel|>', every piece that contains '<|channel>' keeps only what precedes that marker, and the pieces are concatenated and trimmed. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Wrap one tool result between '<|tool_response>' and '<tool_response|>'. A mapping is written as response:NAME followed by its key:value pairs in dictsort order with unquoted keys; any other value is written under the single key value. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- ns.prev_message_type records the kind of the most recent special segment written; the turn-closing logic and the generation-prompt check read it -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- System turn: written when thinking is enabled, when tools are supplied, or when the first message has role system or developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- The thinking token, when enabled, is the first thing written inside the system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the system/developer message has been consumed here, so the main loop starts from the second message -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: record the index within loop_messages of the last message whose role is user (stays -1 when there is none); the reasoning guard in the message loop compares against it -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Main loop: render every message in loop_messages as a turn -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}{#- messages with role tool are not rendered on their own; they are picked up by the forward scan that runs on a message carrying tool_calls -#}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Continuation detection: walk backwards to the nearest earlier non-tool message; when that message was also an assistant message the <|turn>model header is not written again -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Write reasning text (reasoning, else reasoning_content) as a thought channel, but only for messages located after the last user message that also carry tool_calls -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy form: tool_responses embedded on the assistant message itself (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions form: scan forward over the role:tool messages that directly follow this one and render each as a tool response; once a non-tool message is seen the remaining iterations do nothing -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to a function name, starting from the tool message's own name. NOTE(review): get() yields none rather than undefined for a missing key, so default('unknown') may not take effect here - confirm -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Content handling: a string is passed through, a list of parts has the text of its type:text parts concatenated, anything else is passed as is. NOTE(review): the default('') on part.get('text') has the same none-versus-undefined caveat - confirm -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- Closing logic: when the last segment was a tool call and no tool response was written, open a '<|tool_response>' segment; otherwise close the turn, except when tool responses were written and the message has no text content, in which case nothing is written and the turn stays open -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- Open a new model turn for generation, unless the last segment written was a tool response or a tool call -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-2500/optimizer.pt b/checkpoint-2500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7449ab4934dd9d4922d73666d75f35e90001c1a4
--- /dev/null
+++ b/checkpoint-2500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6d3e13dfdbf076cf85af2f2af9389ebb2acf56cf15cf373ae51af80ea2dab7a
+size 72807355
diff --git a/checkpoint-2500/processor_config.json b/checkpoint-2500/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-2500/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-2500/rng_state.pth b/checkpoint-2500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-2500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-2500/scheduler.pt b/checkpoint-2500/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..569a93c33519ee27bc9e3719cfc9b1ab78eeeb19
--- /dev/null
+++ b/checkpoint-2500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8e57b03658c31690f8603ab1d1d5ff52fdb21cb504d78cc2e5d0024447a0df7
+size 1465
diff --git a/checkpoint-2500/tokenizer.json b/checkpoint-2500/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-2500/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-2500/tokenizer_config.json b/checkpoint-2500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-2500/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-2500/trainer_state.json b/checkpoint-2500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c6a8661d2d1c4cd899d743e50a524509a027062a
--- /dev/null
+++ b/checkpoint-2500/trainer_state.json
@@ -0,0 +1,3542 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.45487627365356625,
+  "eval_steps": 100,
+  "global_step": 2500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.3799337281583967e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-2500/training_args.bin b/checkpoint-2500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-2500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-2600/README.md b/checkpoint-2600/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-2600/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** unsloth/gemma-4-E4B-it
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-2600/adapter_config.json b/checkpoint-2600/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-2600/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-2600/adapter_model.safetensors b/checkpoint-2600/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d0db80ad884feee4cb079c5950b4bdb9ae1ea243
--- /dev/null
+++ b/checkpoint-2600/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f173453c4a8f79d4c764bf9dfe079b46c3d61ff14c11f0f4434bd53560c47a67
+size 169741912
diff --git a/checkpoint-2600/chat_template.jinja b/checkpoint-2600/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-2600/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders every entry of properties, sorted by key, as key:{...} with comma separators between entries. The required argument is accepted but never read inside this macro. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}{#- entries with these keys are skipped when filter_keys is true -#}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- becomes true once a field has been written inside this entry, so later fields are preceded by a comma -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}{#- STRING entries may carry an enum list -#}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}{#- ARRAY entries render their items schema field by field -#}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}{#- OBJECT entries recurse into their nested properties -#}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- no properties mapping: the entry's own keys are rendered as properties, with standard_keys filtered out -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- the upper-cased type is always written last and closes the entry -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool from tool_data['function'] as declaration:NAME followed by a braced body holding the description, optional parameters and optional response. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the closing brace of the parameters section is only written inside this branch -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): the closing brace of the response section is only written when type is OBJECT -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serialises one value recursively: strings are wrapped in the <|"|> delimiter, booleans become true/false, mappings become brace-wrapped key:value lists sorted by key, sequences become bracketed lists, and anything else is emitted unchanged. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}{#- mapping keys get the same <|"|> wrapping as string values only when escape_keys is true -#}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}{#- NOTE(review): tested after string and mapping; Jinja's sequence test may also match those types, so the branch order likely matters - confirm before reordering -#}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Returns text with every span from a <|channel> marker up to the following <channel|> marker removed, then trimmed. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}{#- each part ends where a channel block closed -#}
+        {%- if '<|channel>' in part -%}{#- keep only what precedes the opening marker -#}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Wraps one tool result between <|tool_response> and <tool_response|> as response:NAME followed by a braced body. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}{#- mapping responses list their keys sorted, with unescaped keys -#}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}{#- any other response is stored under a single value key -#}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type records the kind of content written last; this section sets it to 'think' or 'tool' -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block: written when thinking is enabled, tools are supplied, or the first message has role system or developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the first message was written here, so the later passes over loop_messages skip it -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index in loop_messages for the reasoning guard; the index stays -1 when there is no user message -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Render every non-tool message as a turn; role:tool messages are skipped here and emitted by the forward scan of the assistant message that precedes them -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel, only for tool-calling messages that come after the last user message -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message.get('tool_calls') -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown', true), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name; otherwise use the tool message's own name. default(..., true) is required because .get() yields None, not Undefined -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array (only 'text' parts are concatenated) -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+        {#- Turn ending: emit <|tool_response> when tool calls have no responses yet; emit nothing when responses were rendered without content; otherwise emit <turn|> -#}
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Emit the generation prompt only when the last rendered message did not end on a tool call or tool response -#}
+{%- set mid_model_turn = ns.prev_message_type in ['tool_response', 'tool_call'] -%}
+{%- if add_generation_prompt and not mid_model_turn -%}
+    {{- '<|turn>model\n' -}}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-2600/optimizer.pt b/checkpoint-2600/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..da55ef4e8f962be5c826a51ba8054c545d28f883
--- /dev/null
+++ b/checkpoint-2600/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f51cc35a25c1ab7f195b458b4d5a61a4c9a4f62dcbc247421ea1828376819ed7
+size 72807355
diff --git a/checkpoint-2600/processor_config.json b/checkpoint-2600/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-2600/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-2600/rng_state.pth b/checkpoint-2600/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-2600/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-2600/scheduler.pt b/checkpoint-2600/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..07e2637406963f0ce5a57ced3c79ae42c7b38062
--- /dev/null
+++ b/checkpoint-2600/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aaab7036ca9f4229b7c51e6e51d9bf8345f474742af924e02c88c4478ec5f987
+size 1465
diff --git a/checkpoint-2600/tokenizer.json b/checkpoint-2600/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-2600/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-2600/tokenizer_config.json b/checkpoint-2600/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-2600/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-2600/trainer_state.json b/checkpoint-2600/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..98e3122b2b9e8c6182f1f7bfa646ab302adeb85b
--- /dev/null
+++ b/checkpoint-2600/trainer_state.json
@@ -0,0 +1,3682 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.47307132459970885,
+  "eval_steps": 100,
+  "global_step": 2600,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.434562636887095e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-2600/training_args.bin b/checkpoint-2600/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-2600/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-2700/README.md b/checkpoint-2700/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-2700/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** unsloth/gemma-4-E4B-it
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-2700/adapter_config.json b/checkpoint-2700/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-2700/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-2700/adapter_model.safetensors b/checkpoint-2700/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..203992c22dcd71e4ca8c03add6274511415a9c58
--- /dev/null
+++ b/checkpoint-2700/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5228b3e1f8e018ba523a5a58ecc10a22b9a43c4013b47bcbff970a2bb3659f41
+size 169741912
diff --git a/checkpoint-2700/chat_template.jinja b/checkpoint-2700/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-2700/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders a schema `properties` mapping as comma-separated `key:{...}` entries, visiting keys in dictsort order. When filter_keys is true, keys listed in standard_keys are skipped. NOTE(review): the `required` argument is accepted but never read inside this macro body. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}{#- found_first becomes true once an entry has been written, so every later entry is preceded by a comma -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- add_comma becomes true once a field has been written inside the current entry -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}{#- enum is only rendered for STRING-typed entries -#}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}{#- items is only rendered for ARRAY-typed entries whose items value is a non-empty mapping -#}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}{#- keys of items whose value is none are omitted -#}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- fallback when there is no 'properties' mapping: the entry's own keys, minus standard_keys, are rendered as its properties -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- type is always written last; the brace after it closes the entry opened at `key:{` -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool definition as `declaration:NAME{description:...` followed by optional `,parameters:{...` and `,response:{...` sections and a final closing brace. Reads tool_data['function'] keys name, description, parameters and response. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the brace closing `parameters:{` is only written inside this branch, so a parameters object without a truthy 'type' leaves it unclosed; confirm this matches the format the model expects -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): the brace closing `response:{` is only written when the response type is OBJECT; confirm other types are never passed -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Recursively serialises a value: strings are wrapped in the <|"|> quote token, booleans become true/false, mappings become {key:value,...} in dictsort order, other sequences become [a,b,...], and anything else is emitted as-is. escape_keys controls whether mapping keys are also wrapped in the quote token and is passed down to nested values. -#}
+    {%- if argument is string -%}{#- the string test comes before the sequence test further down, so strings never reach the list branch -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}{#- remaining values (for example numbers or none) are rendered with their default string form -#}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Returns text with every `<|channel>...<channel|>` span removed and the result trimmed. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}{#- each part ends where a closing channel marker stood (or at the end of the text) -#}
+        {%- if '<|channel>' in part -%}{#- keep only what precedes the first opening marker; this also drops an unterminated channel at the end of the text -#}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emits `<|tool_response>response:NAME{...}<tool_response|>`. A mapping response is written as key:value pairs in dictsort order; any other response is wrapped as {value:...}. Values go through format_argument with escape_keys=False. tool_name is concatenated with string literals, so it is expected to be a string. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- ns.prev_message_type records the kind of element rendered most recently; it is read when closing a turn and when deciding whether to add the generation prompt -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block: a system turn is opened when enable_thinking is set, tools are supplied, or the first message has role system/developer. NOTE(review): messages[0] is read without checking that messages is non-empty. -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}{#- content-parts form: each part's text is trimmed and followed by a single space -#}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the system/developer message has been consumed here, so the main loop starts after it -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index (position within loop_messages) for the reasoning guard; the value stays -1 when loop_messages holds no user message -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages. role:tool messages get no output on their own iteration; they are rendered only by the forward scan of an assistant message that carries tool_calls. -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant (found by walking backwards from the current index) -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel, but only for messages after the last user message that also carry tool_calls -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}{#- each call is written as <|tool_call>call:NAME{args}<tool_call|>; mapping arguments are serialised in dictsort order, string arguments are passed through verbatim -#}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag becomes true once at least one tool response has been rendered for this message -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages; the scan stops at the first non-tool message -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name. The fallback uses default(..., true) because .get() yields none rather than undefined for a missing key, and none would break the string concatenation in format_tool_response_block. -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array; only parts of type text are concatenated, and a text part without a text value contributes an empty string -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}{#- content is captured first so that has_content below can test whether anything was actually rendered -#}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- tool call with no rendered response: the turn is not closed and the opening tool-response marker is emitted instead -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}{#- close the turn, except when tool responses were rendered and the message has no text content -#}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- Append the opening of a model turn for generation, unless the last rendered element was a tool call or tool response. NOTE(review): presumably the model turn is still open in those cases; confirm against the turn-closing logic of the message loop. -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-2700/optimizer.pt b/checkpoint-2700/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..685f98980bb4ccd9a276b3f51ce543a8f682776d
--- /dev/null
+++ b/checkpoint-2700/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4bd0f7cf5bad80e0365b13b8cffad014af469876163af9316ef3ed54d554656
+size 72807355
diff --git a/checkpoint-2700/processor_config.json b/checkpoint-2700/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-2700/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-2700/rng_state.pth b/checkpoint-2700/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-2700/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-2700/scheduler.pt b/checkpoint-2700/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..1368bc7697b897a75c8ff7e6470c07f1fb933def
--- /dev/null
+++ b/checkpoint-2700/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7f25b53574ff484cfa756a6e440f8376a502b769885540ab8e7db003faf2b7a
+size 1465
diff --git a/checkpoint-2700/tokenizer.json b/checkpoint-2700/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-2700/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-2700/tokenizer_config.json b/checkpoint-2700/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-2700/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-2700/trainer_state.json b/checkpoint-2700/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e2457148503f2ee163a1aec96a13cf71b4e4d3e
--- /dev/null
+++ b/checkpoint-2700/trainer_state.json
@@ -0,0 +1,3822 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.4912663755458515,
+  "eval_steps": 100,
+  "global_step": 2700,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.4891834667741673e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-2700/training_args.bin b/checkpoint-2700/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-2700/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-2800/README.md b/checkpoint-2800/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-2800/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-2800/adapter_config.json b/checkpoint-2800/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-2800/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-2800/adapter_model.safetensors b/checkpoint-2800/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1b96fadfcb5707bab87751ed99b96dd227e3a365
--- /dev/null
+++ b/checkpoint-2800/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:975a596b0f9f20e20d0a121966ce95f9f0cc47ac7a3071454c651134ed2521c0
+size 169741912
diff --git a/checkpoint-2800/chat_template.jinja b/checkpoint-2800/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-2800/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders the `properties` mapping as comma-separated `name:{...}` entries in key order (dictsort); each entry lists its optional fields and always ends with its upper-cased type. With filter_keys=true, keys found in standard_keys are skipped. NOTE(review): the `required` argument is accepted but never read in this body; only value['required'] is used - confirm whether that is intended. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}{#- Keys that are not treated as property names when filter_keys is true. -#}
+    {%- set ns = namespace(found_first=false) -%}{#- found_first: set once an entry has been written, so commas go between entries only. -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- add_comma: becomes true once any field has been written inside the current entry. -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}{#- STRING: only an optional enum list is added. -#}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}{#- ARRAY: the non-empty `items` mapping is rendered key by key, skipping none values. -#}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}{#- type may be a single string or a list of strings; both are upper-cased. -#}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}{#- OBJECT: recurse into nested properties, then list the required names. -#}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- No usable 'properties' key: recurse over the value itself with filter_keys=true so the standard keys are skipped. -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- The type field is always written last and carries the closing brace of the entry. -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool definition from tool_data['function'] as `declaration:NAME{description:...}` with optional `,parameters:{...}` and `,response:{...}` sections. NOTE(review): the closing brace of `parameters:{` is written only inside the params['type'] branch, and that of `response:{` only when the response type is OBJECT - confirm that callers always supply these. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- This branch also emits the brace that closes `parameters:{`. -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- This branch also emits the brace that closes `response:{`. -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Recursively serializes a value: strings are wrapped in <|"|> markers, booleans become true/false, mappings become `{key:value,...}` in key order, other sequences become `[a,b,...]`, and anything else is printed as-is. escape_keys selects whether mapping keys are also wrapped in <|"|> markers; it is passed unchanged to nested calls. -#}
+    {%- if argument is string -%}{#- Tested before the sequence branch so that strings are quoted rather than iterated. -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}{#- found_first: set after the first pair so commas go between pairs only. -#}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}{#- Fallback for values matched by none of the tests above: printed without quoting. -#}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Returns `text` with thinking spans removed: the text is split on '<channel|>', and for every piece that contains '<|channel>' only the part before that marker is kept. The kept parts are concatenated and the result is trimmed. -#}
+    {%- set ns = namespace(result='') -%}{#- A namespace is used so the accumulated string can be updated from inside the loop. -#}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Wraps one tool result in '<|tool_response>' ... '<tool_response|>'. A mapping is written as `response:NAME{key:value,...}` in key order; any other value is written as `response:NAME{value:...}`. Values go through format_argument with escape_keys=False, so keys are written without <|"|> markers. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}{#- Non-mapping results are placed under a single `value` key. -#}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type records the kind of the last special piece written ('think', 'tool', 'tool_call' or 'tool_response'). -#}
+{%- set loop_messages = messages -%}{#- Messages left for the main loop; a leading system/developer message is removed below once it has been rendered. -#}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block: a system turn is opened when thinking is enabled, tools are supplied, or the first message is system/developer. NOTE(review): messages[0] is indexed without an emptiness check - confirm callers never pass an empty list. -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}{#- Content given as a list of parts: each part's text is trimmed and followed by one space. -#}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- Already rendered above, so the main loop starts at the second message. -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard. The index refers to loop_messages and stays -1 when no message has role 'user'; the message loop compares it with loop.index0 before rendering reasoning text. -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-2800/optimizer.pt b/checkpoint-2800/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4f05633833a462e0fdae1e939af2dc0bd2cfc786
--- /dev/null
+++ b/checkpoint-2800/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5fafd57d1b93c1a7117bf468c29689eeab4e80ec6d969115ed4cb60c57ca13b
+size 72807355
diff --git a/checkpoint-2800/processor_config.json b/checkpoint-2800/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-2800/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-2800/rng_state.pth b/checkpoint-2800/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-2800/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-2800/scheduler.pt b/checkpoint-2800/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..96a4e6609d42076027566db9feeed9130d0f2790
--- /dev/null
+++ b/checkpoint-2800/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9bb4aacf6d545f189a555097c83dcbdc89c9069ccc831393243fe44af2f96596
+size 1465
diff --git a/checkpoint-2800/tokenizer.json b/checkpoint-2800/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-2800/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-2800/tokenizer_config.json b/checkpoint-2800/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-2800/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-2800/trainer_state.json b/checkpoint-2800/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d20edad2fc0535646df507ffbcc9642f65dd5fe4
--- /dev/null
+++ b/checkpoint-2800/trainer_state.json
@@ -0,0 +1,3962 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5094614264919942,
+  "eval_steps": 100,
+  "global_step": 2800,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.545047564883377e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-2800/training_args.bin b/checkpoint-2800/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-2800/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-2900/README.md b/checkpoint-2900/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-2900/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** unsloth/gemma-4-E4B-it
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-2900/adapter_config.json b/checkpoint-2900/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-2900/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-2900/adapter_model.safetensors b/checkpoint-2900/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e93ecc7dec5f1db01880fe9ec51835079156f03d
--- /dev/null
+++ b/checkpoint-2900/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:732120270facc7d6d2d69d3ca0c49897f3ba3f1a3559d0b1f470be38475a79b6
+size 169741912
diff --git a/checkpoint-2900/chat_template.jinja b/checkpoint-2900/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-2900/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders every entry of `properties`, sorted by key, as comma-separated key:{...} groups. NOTE(review): the `required` argument is never read inside this macro. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}{#- Keys that are skipped as property names when filter_keys is true. -#}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- Becomes true once a field has been written for this entry, so later fields are preceded by a comma. -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}{#- Recurse into the item schema's own properties. -#}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- No usable 'properties' mapping: recurse over the value's own keys, skipping those listed in standard_keys. -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- The upper-cased type is always the last field and is followed by the brace that closes this entry. -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders tool_data['function'] as declaration:NAME{description:...} with optional parameters and response groups. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the brace that closes the parameters group is written only inside this branch, so a schema without a truthy 'type' leaves it unclosed; confirm the expected format before changing. -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): the brace that closes the response group is written only when the response type is OBJECT. -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serializes one value recursively: strings are wrapped in the <|"|> marker, booleans become true/false, mappings become {k:v,...} sorted by key, other sequences become [a,b,...], and anything else is printed as-is. -#}
+    {%- if argument is string -%}{#- Strings are tested before the sequence branch below. -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}{#- Keys are wrapped in the <|"|> marker only when escape_keys is true; the flag is passed down to nested values. -#}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Returns `text` with each span from '<|channel>' up to the next '<channel|>' removed, then trimmed. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}{#- Keep only what precedes the opening marker; this also drops the tail after an opening marker that is never closed. -#}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emits one tool result wrapped in '<|tool_response>' ... '<tool_response|>' and labelled with tool_name. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}{#- Mapping: one key:value pair per entry, sorted by key, with keys left unescaped. -#}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}{#- Any other value is placed under a single 'value' key. -#}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- Records the kind of the most recently emitted piece; read later when closing turns and when deciding whether to add the generation prompt. -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}{#- Content-parts list: each item's trimmed text is written followed by one space. -#}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- The leading system/developer message is rendered here, so it is excluded from the main loop. -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}{#- Stays -1 when no message in loop_messages has role 'user'. -#}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}{#- role:tool messages open no turn of their own; they are rendered by the forward scan below, from the assistant message that carries tool_calls. -#}
+    {%- set ns.prev_message_type = None -%}{#- Reset on every non-tool message, so the value left after the loop describes the last non-tool message. -#}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}{#- Thinking is emitted only for messages after the last user message that also have tool_calls. -#}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}{#- String arguments are inserted verbatim between the braces. -#}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag becomes true once at least one tool response has been rendered for this message. -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}{#- Rendered into a variable so that emptiness can be tested through has_content below. -#}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- Last piece was a tool call and no tool response was rendered: emit '<|tool_response>' instead of closing the turn. -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}{#- Close the turn unless tool responses were rendered and no content followed them. -#}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}{#- Open a new model turn unless the last non-tool message ended on a tool call or a tool response. -#}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-2900/optimizer.pt b/checkpoint-2900/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..fe4b6b047f623f02808df43aacb1626fbc0c0293
--- /dev/null
+++ b/checkpoint-2900/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a918f35832fbacc3db98ebed06ee510ed39bc3d87ee127fa994dbda8cb345bb
+size 72807355
diff --git a/checkpoint-2900/processor_config.json b/checkpoint-2900/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-2900/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-2900/rng_state.pth b/checkpoint-2900/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-2900/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-2900/scheduler.pt b/checkpoint-2900/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..88479698beda3260b0434a3c81dac58cc905a738
--- /dev/null
+++ b/checkpoint-2900/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f55cadcff16b609e6d7d899d6d85d51d6152dd63f70acbdd873e182531f819b0
+size 1465
diff --git a/checkpoint-2900/tokenizer.json b/checkpoint-2900/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-2900/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-2900/tokenizer_config.json b/checkpoint-2900/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-2900/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-2900/trainer_state.json b/checkpoint-2900/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d779da3a571fa96ac3973cdf26d413aad71c12c7
--- /dev/null
+++ b/checkpoint-2900/trainer_state.json
@@ -0,0 +1,4102 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5276564774381368,
+  "eval_steps": 100,
+  "global_step": 2900,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.600169064604197e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-2900/training_args.bin b/checkpoint-2900/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-2900/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-300/README.md b/checkpoint-300/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-300/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-300/adapter_config.json b/checkpoint-300/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-300/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-300/adapter_model.safetensors b/checkpoint-300/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7eb6dba203b6266f68fdc6f721ff4b83a6823a04
--- /dev/null
+++ b/checkpoint-300/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2aacf12521886db4beac302ec42b39ca34d003cd214d61ec1154b5d89fd8ce0e
+size 169741912
diff --git a/checkpoint-300/chat_template.jinja b/checkpoint-300/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-300/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders every entry of `properties` (visited in key order via dictsort) as a comma-separated `key:{...}` schema fragment. With filter_keys=true, keys listed in standard_keys are skipped. Note: `required` is accepted but never read inside this macro. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}{#- found_first flips to true once one entry has been emitted, so later entries get a leading comma -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- add_comma tracks whether a field was already written inside this entry's braces -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}{#- entries of `items` whose value is none are omitted entirely -#}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- no usable `properties` mapping: treat the non-standard keys of `value` itself as the nested properties -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- `type` is always the last field and its line also closes the entry's brace -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool definition from tool_data['function'] as `declaration:NAME{description:...[,parameters:{...}][,response:{...}]}`. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the brace opened by `parameters:{` is closed only on the type line below, so params without a truthy 'type' leave it unclosed - confirm callers always supply it -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): likewise, the `response:{` brace is closed only when the response type is OBJECT -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serializes one value recursively: strings are wrapped in <|"|> delimiters, booleans become true/false, mappings become {k:v,...} in key order (keys are wrapped in delimiters only when escape_keys is true), other sequences become [a,b,...], and anything else is printed as-is. -#}
+    {%- if argument is string -%}{#- tested first: strings and mappings would also satisfy the later `is sequence` test -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Returns `text` without its thinking spans: everything from each `<|channel>` marker up to the following `<channel|>` (or to the end of the text when unterminated) is dropped, the `<channel|>` markers themselves are removed, and the result is trimmed. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}{#- keep only the text that precedes the opening marker -#}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emits `<|tool_response>response:NAME{...}<tool_response|>`. A mapping response is rendered as key:value pairs in key order; any other response is rendered as a single `value:` field. tool_name is concatenated to a string literal, so it must be a string. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- ns.prev_message_type records the kind of the last emitted element; it is read later to choose the turn terminator and to decide whether a generation prompt is emitted -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- System turn: emitted when thinking is enabled, tools are supplied, or the first message has role system/developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- The thinking token goes first, ahead of any system text and tool declarations -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the leading system/developer message is consumed here; later processing only sees the remaining messages -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: record the index within loop_messages of the last user message (-1 when there is none); this index is the reference point for the reasoning guard -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Main loop: render every non-tool message as a turn. role:tool messages are never rendered on their own; they are folded into the assistant message that issued the matching tool calls (forward scan below) -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: walk backwards to the nearest non-tool message; if it was also an assistant message, the model turn is still open and no new <|turn>model header is emitted -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as a thinking channel, but only for tool-calling messages that come after the last user message -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}{#- string arguments are emitted verbatim between the braces -#}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag becomes true once at least one tool response has been emitted for this message -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan the consecutive role:tool messages that follow and stop at the first non-tool message -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name. default(..., true) is required: dict.get() yields None (not undefined) for a missing key, and a None name would break the string concatenation in format_tool_response_block -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}{#- only text parts are concatenated; a text part whose 'text' is missing or None contributes an empty string -#}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}{#- Turn terminator below: a tool call with no response leaves `<|tool_response>` open; a message that emitted tool responses but no content leaves the turn open; every other message is closed with `<turn|>` -#}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- Open a new model turn for generation, except when the last rendered element was a tool call or a tool response -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-300/optimizer.pt b/checkpoint-300/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6cc6c15c0f60bd21339f80800867759afb17d443
--- /dev/null
+++ b/checkpoint-300/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3dfbfc331e5f36533cbba9406d38c1df79cb1bdb6c5e3c6088e48faec99533e5
+size 72807355
diff --git a/checkpoint-300/processor_config.json b/checkpoint-300/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-300/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-300/rng_state.pth b/checkpoint-300/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-300/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-300/scheduler.pt b/checkpoint-300/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4fd9c25e3450ea06b62e8f786fb70859e6136672
--- /dev/null
+++ b/checkpoint-300/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:40ce927394996dcd38318599e573f8e5f6def23906b68897e863fdfe657ab241
+size 1465
diff --git a/checkpoint-300/tokenizer.json b/checkpoint-300/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-300/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-300/tokenizer_config.json b/checkpoint-300/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-300/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-300/trainer_state.json b/checkpoint-300/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..738d41339059de7cc78d40936a8c94b41b577caf
--- /dev/null
+++ b/checkpoint-300/trainer_state.json
@@ -0,0 +1,462 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.05458515283842795,
+  "eval_steps": 100,
+  "global_step": 300,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.7433397849108378e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-300/training_args.bin b/checkpoint-300/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-300/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-3000/README.md b/checkpoint-3000/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-3000/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-3000/adapter_config.json b/checkpoint-3000/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-3000/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-3000/adapter_model.safetensors b/checkpoint-3000/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..91bf604436b870cb0685509b6f0bbcd7618b543d
--- /dev/null
+++ b/checkpoint-3000/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:561130f2a2f932a2beb712641a1a0c8f7402406d273ef7184af992d24f288c70
+size 169741912
diff --git a/checkpoint-3000/chat_template.jinja b/checkpoint-3000/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-3000/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Render each non-tool message as one turn; role:tool messages are not rendered on their own but folded into the preceding tool-calling assistant turn by the forward scan below -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel, only for tool-calling messages located after the last user message -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+    {#- Serialize each tool call between <|tool_call> and <tool_call|>; mapping arguments are emitted as key:value pairs in sorted key order, string arguments verbatim -#}
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+            {#- ns_tr_out.flag records whether any tool response was rendered inside this turn; the turn-closing logic at the end of the loop body reads it -#}
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name; otherwise use the tool message's own name. .get() yields None (not Undefined) for a missing key, so default needs boolean=true to fall back to 'unknown' -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array (only parts of type 'text' are concatenated; a text part without usable text contributes an empty string) -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {#- Capture the rendered content so the closing logic can tell whether this message produced any non-whitespace output -#}
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+        {#- Close the turn: if ns.prev_message_type is still 'tool_call' and no tool response was rendered, end on an open <|tool_response>; if tool responses were rendered but there is no content, emit nothing; otherwise emit <turn|> -#}
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Open a model turn for generation unless the conversation currently ends on a tool call or a tool response -#}
+{%- set awaiting_tool_io = ns.prev_message_type in ['tool_response', 'tool_call'] -%}
+{%- if add_generation_prompt and not awaiting_tool_io -%}
+    {{- '<|turn>model\n' -}}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-3000/optimizer.pt b/checkpoint-3000/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..40640158123914b30fa2b8006f7d72b13ff779c9
--- /dev/null
+++ b/checkpoint-3000/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff954c41539e6202c1b840f66ad70716b5107dd46043d7deb64a6cddabcceb6b
+size 72807355
diff --git a/checkpoint-3000/processor_config.json b/checkpoint-3000/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-3000/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-3000/rng_state.pth b/checkpoint-3000/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-3000/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-3000/scheduler.pt b/checkpoint-3000/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f4e10983f12f2ee71fb90f299f3a40de6b072a5f
--- /dev/null
+++ b/checkpoint-3000/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fdd50cc977c9a153c6fd2866e786909af07ca646ecb6892b8cd2d8a1df02834c
+size 1465
diff --git a/checkpoint-3000/tokenizer.json b/checkpoint-3000/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-3000/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-3000/tokenizer_config.json b/checkpoint-3000/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-3000/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-3000/trainer_state.json b/checkpoint-3000/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..aee470cbfa1d68b89c7eebf2bef32ba796067967
--- /dev/null
+++ b/checkpoint-3000/trainer_state.json
@@ -0,0 +1,4242 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5458515283842795,
+  "eval_steps": 100,
+  "global_step": 3000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.6546640174048517e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-3000/training_args.bin b/checkpoint-3000/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-3000/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-3100/README.md b/checkpoint-3100/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-3100/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for text generation
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** unsloth/gemma-4-E4B-it
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-3100/adapter_config.json b/checkpoint-3100/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-3100/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-3100/adapter_model.safetensors b/checkpoint-3100/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..84b92e3c2e5c807881034cfc23fd388b48bcdca4
--- /dev/null
+++ b/checkpoint-3100/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2cf1a639191ca794ea4a0c6cc967246b64d61682f2942d49b2c15f9efa375139
+size 169741912
diff --git a/checkpoint-3100/chat_template.jinja b/checkpoint-3100/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-3100/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Render every entry of `properties` (iterated in dictsort order) as a comma-separated `key:{...}` record. `required` is accepted but never read inside this macro; with filter_keys=true, keys listed in standard_keys are skipped. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}{#- schema keywords that must not be mistaken for property names when filter_keys is true -#}
+    {%- set ns = namespace(found_first=false) -%}{#- found_first flips to true after the first emitted entry so later entries get a leading comma -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- add_comma: true once any field has been written inside the current entry -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}{#- items is rendered key by key; entries whose value is none are skipped -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- OBJECT without a properties mapping: recurse over the value itself, filtering out the schema keywords -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- type is always written last and its trailing brace closes the entry -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Render one tool definition as `declaration:NAME{description:...,parameters:{...},response:{...}}`. Reads tool_data['function'] keys name, description, parameters and, when present, response. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>
+        {%- endif -%}{{- '}' -}}{#- always close parameters:{ so the braces stay balanced even when the schema carries no type; output is unchanged when type is present -#}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>
+        {%- endif -%}{{- '}' -}}{#- always close response:{ even when the response type is not OBJECT; output is unchanged for OBJECT responses -#}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serialize one value: strings are wrapped in <|"|> markers, booleans become true/false, mappings become a brace-delimited key:value list in dictsort order, other sequences become a bracketed list, anything else is printed as-is. escape_keys controls whether mapping keys are also wrapped in <|"|>; it is passed down on recursion. -#}
+    {%- if argument is string -%}{#- tested before the sequence branch so strings are never rendered as lists -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Return `text` with every span from '<|channel>' up to the next '<channel|>' removed, then trimmed. -#}
+    {%- set ns = namespace(result='') -%}{#- namespace is needed because a plain set inside the for loop would not persist across iterations -#}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}{#- keep only the text that precedes the channel opener in this piece -#}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emit one '<|tool_response>response:NAME{...}<tool_response|>' block. A mapping response is written as key:value pairs in dictsort order; any other response is wrapped as a single value: field. Values go through format_argument with escape_keys=False. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type records the kind of the last thing rendered; it is read again when deciding how to close a turn and whether to add the generation prompt -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- System turn: emitted when thinking is enabled, tools are supplied, or the first message has role system/developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- The thinking marker goes first, ahead of any system text and tool declarations -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}{#- content given as a list of parts: each part's text is trimmed and followed by one space -#}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the system/developer message is consumed here, so the main loop starts at the second message -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: record the index (within loop_messages) of the last user message, or -1 if there is none; the reasoning guard in the message loop compares against it -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Render every non-tool message as one turn; role:tool messages are only emitted by the forward scan of the assistant tool_calls message that precedes them -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}{#- reset per rendered message; the branches below set it to tool_call, tool_response, image, audio or video -#}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Continuation check: walk backwards to the nearest non-tool message; if it was also an assistant message, the <|turn>model header is not repeated -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Reasoning (reasoning or reasoning_content) is rendered as a thought channel only when the message comes after the last user message and also carries tool_calls -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}{#- string arguments are inserted verbatim between the braces, without re-serialization -#}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag becomes true once at least one tool response block has been written for this message -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy layout: tool_responses embedded on the assistant message itself (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions layout: scan forward over the consecutive role:tool messages that follow, stopping at the first non-tool message -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Tool name: start from the tool message's own name (or 'unknown'), then override with the function name of the tool_call whose id matches tool_call_id -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Content may be a string or a list of content parts; for a list only the text parts are kept and concatenated -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}{#- content is rendered into a variable first so that its emptiness can be tested when closing the turn -#}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- tool call with no response yet: open a tool_response block instead of closing the turn; a turn holding tool responses but no content is left open; every other case closes the turn -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- Open a new model turn for generation unless the last rendered message ended on a tool call or tool response, in which case no new turn header is written -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}{#- NOTE(review): an assistant message with tool_calls, tool results and non-empty content closes its turn in the loop but leaves prev_message_type as tool_response, so no model turn is opened here; confirm this is intended -#}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-3100/optimizer.pt b/checkpoint-3100/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..df513f92578eda8738ddeff751ad325fe7d5b473
--- /dev/null
+++ b/checkpoint-3100/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5ef6405a1424ab6e05360a6ae89bc28d3c772739dd148801543b8df622246e1
+size 72807355
diff --git a/checkpoint-3100/processor_config.json b/checkpoint-3100/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-3100/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-3100/rng_state.pth b/checkpoint-3100/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-3100/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-3100/scheduler.pt b/checkpoint-3100/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0bfdabc81daa78f9144105e4599edc5cbb241854
--- /dev/null
+++ b/checkpoint-3100/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d180b1c2dc0dee95fd0c0a838c18e51eb51269d25ce2b5a31707f5876e99f22
+size 1465
diff --git a/checkpoint-3100/tokenizer.json b/checkpoint-3100/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-3100/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-3100/tokenizer_config.json b/checkpoint-3100/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-3100/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-3100/trainer_state.json b/checkpoint-3100/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6ad62e3a647bb7ebf5c9d5b2f78f3206e3654d71
--- /dev/null
+++ b/checkpoint-3100/trainer_state.json
@@ -0,0 +1,4382 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5640465793304221,
+  "eval_steps": 100,
+  "global_step": 3100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.70859825492724e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-3100/training_args.bin b/checkpoint-3100/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-3100/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-3200/README.md b/checkpoint-3200/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-3200/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-3200/adapter_config.json b/checkpoint-3200/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-3200/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-3200/adapter_model.safetensors b/checkpoint-3200/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2c1c591b838821579e7bf8d85202b7d52b68eaf1
--- /dev/null
+++ b/checkpoint-3200/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:32fb177fbe136ca043ed6035a43af0dd484d8fcda80671f1f5f4357be09052d5
+size 169741912
diff --git a/checkpoint-3200/chat_template.jinja b/checkpoint-3200/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-3200/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-3200/optimizer.pt b/checkpoint-3200/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..bcda9c590a157673604ca0d5a969e5d4674132bb
--- /dev/null
+++ b/checkpoint-3200/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c595669c7ffdaea388525bc80a51457a31a22b1735f005c778ba49ce7b80851
+size 72807355
diff --git a/checkpoint-3200/processor_config.json b/checkpoint-3200/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-3200/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-3200/rng_state.pth b/checkpoint-3200/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-3200/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-3200/scheduler.pt b/checkpoint-3200/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..1fda2f44100f2f39146503b390bade53e8648468
--- /dev/null
+++ b/checkpoint-3200/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1485f639f1264de28af0debdb7af8d42b220fc574595895bd6d45b959ed11c9
+size 1465
diff --git a/checkpoint-3200/tokenizer.json b/checkpoint-3200/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-3200/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-3200/tokenizer_config.json b/checkpoint-3200/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-3200/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-3200/trainer_state.json b/checkpoint-3200/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..60a0cdcf5851d6fce0b04bde66dbd209ad903699
--- /dev/null
+++ b/checkpoint-3200/trainer_state.json
@@ -0,0 +1,4522 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5822416302765647,
+  "eval_steps": 100,
+  "global_step": 3200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.7637015226676337e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-3200/training_args.bin b/checkpoint-3200/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-3200/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-3300/README.md b/checkpoint-3300/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-3300/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for checkpoint-3300 (LoRA adapter for unsloth/gemma-4-E4B-it)
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** unsloth/gemma-4-E4B-it
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-3300/adapter_config.json b/checkpoint-3300/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-3300/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-3300/adapter_model.safetensors b/checkpoint-3300/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9511bf5a0688db833bc47fbc607c9fde10e1f8d9
--- /dev/null
+++ b/checkpoint-3300/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c1f146ba2327ee02f804685347ed602693fc59d0277708585446f16fe7f35b6
+size 169741912
diff --git a/checkpoint-3300/chat_template.jinja b/checkpoint-3300/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-3300/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Serializes a schema properties mapping as comma-separated entries of the form key, colon, braced field list, visited in key-sorted order. NOTE(review): callers pass a value for required, but this macro body never reads it. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}{#- Schema keywords that are not treated as property names when filter_keys is true. -#}
+    {%- set ns = namespace(found_first=false) -%}{#- Remembers whether an entry was already written, so later entries get a leading comma. -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- Per-entry flag: becomes true once any field has been written inside this entry. -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}{#- enum is only emitted for STRING-typed entries. -#}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}{#- items is only emitted when it is a non-empty mapping. -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}{#- Fields of items whose value is none are skipped entirely. -#}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- No properties mapping: recurse over the entry itself, with filter_keys=true hiding the standard schema keywords. -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- type is always the last field; the line that writes it also closes the entry. -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool as declaration:NAME followed by a braced body holding description, optional parameters and optional response. Braces are always balanced, and commas are written only between fields that are actually present. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} }{{- ',' if params['required'] or params['type'] -}}
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>{{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ]{{- ',' if params['type'] -}}
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>
+        {%- endif -%}{#- The parameters brace is closed even when the schema has no type; it used to be written only together with type. -#}
+        }
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        {%- set response_is_object = response_declaration['type'] | upper == 'OBJECT' -%}{#- As before, a response type is only written when it is OBJECT. -#}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>{{- ',' if response_is_object -}}
+        {%- endif -%}
+        {{- 'type:<|"|>OBJECT<|"|>' if response_is_object -}}
+        }
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {#- Renders one value in the prompt's argument notation, recursing into containers. -#}
+    {#- Strings are wrapped in the string delimiter and booleans become true or false. -#}
+    {#- Mappings are written in key-sorted order inside braces, sequences inside brackets. -#}
+    {#- escape_keys decides whether mapping keys are wrapped in the delimiter like strings. -#}
+    {%- set quote = '<|"|>' -%}
+    {%- if argument is string -%}
+        {{- quote + argument + quote -}}
+    {%- elif argument is boolean -%}
+        {%- if argument -%}true{%- else -%}false{%- endif -%}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- for name, member in argument | dictsort -%}
+            {%- if not loop.first %},{% endif -%}
+            {{- (quote + name + quote) if escape_keys else name -}}
+            :{{- format_argument(member, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for element in argument -%}
+            {%- if not loop.first %},{% endif -%}
+            {{- format_argument(element, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {#- Any other value is printed with default rendering. -#}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}
+    {#- Removes thinking-channel spans from text and trims the result. -#}
+    {#- The text is cut at every closing channel marker; within each piece, -#}
+    {#- everything from the opening channel marker onward is discarded. -#}
+    {%- set acc = namespace(kept='') -%}
+    {%- for segment in text.split('<channel|>') -%}
+        {#- split(...)[0] yields the whole segment when the opening marker is absent. -#}
+        {%- set acc.kept = acc.kept + segment.split('<|channel>')[0] -%}
+    {%- endfor -%}
+    {{- acc.kept | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {#- Wraps one tool result in the tool_response markers as response:NAME plus a braced body. -#}
+    {#- A mapping lists its fields in key-sorted order; any other payload is stored under value. -#}
+    {{- '<|tool_response>response:' + tool_name + '{' -}}
+    {%- if response is mapping -%}
+        {%- for field, payload in response | dictsort -%}
+            {%- if not loop.first %},{% endif -%}
+            {{- field -}}:{{- format_argument(payload, escape_keys=False) -}}
+        {%- endfor -%}
+    {%- else -%}
+        {{- 'value:' + format_argument(response, escape_keys=False) -}}
+    {%- endif -%}
+    {{- '}' + '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- Records the kind of the last item rendered; read when closing each turn and when deciding on the generation prompt. -#}
+{%- set loop_messages = messages -%}{#- Messages the main loop will render; replaced below when a leading system/developer message is consumed. -#}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}{#- NOTE(review): messages[0] is indexed without an emptiness check; presumably callers always pass at least one message. -#}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}{#- Content-parts form: each part's text is trimmed and followed by a single space. -#}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- The leading system/developer message is consumed here, so the main loop does not see it. -#}
+    {%- endif -%}
+    {%- if tools -%}{#- Each tool declaration is wrapped in its own pair of tool markers inside the system turn. -#}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: remember the position of the final user message; reasoning is only rendered after it -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for scanned in loop_messages -%}
+    {%- if scanned['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = loop.index0 -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}{#- role:tool messages are never rendered on their own; they are only emitted by the forward scan of an assistant message that has tool_calls. -#}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}{#- NOTE(review): this also matches two back-to-back assistant messages with no tool exchange between them, where the earlier turn was already closed; confirm that input shape cannot occur. -#}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}{#- Reasoning is only replayed for tool-calling messages that come after the last user message. -#}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}{#- String arguments are inserted verbatim between the braces. -#}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag becomes true once at least one tool response block has been written for this message. -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}{#- Empty branch: once a non-tool message has been seen, the remaining messages are ignored. -#}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}{#- get() returns none for a missing key, so default needs its boolean flag for the fallback to apply. -#}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}{#- Content is captured first so that its emptiness can be tested below. -#}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- Tool calls with no response rendered: leave the turn open at a tool_response marker. -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}{#- Close the turn, except when tool responses were rendered and no text followed them. -#}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Generation prompt: open a fresh model turn, -#}
+{#- unless the transcript ended inside a tool exchange. -#}
+{%- if add_generation_prompt and ns.prev_message_type not in ['tool_response', 'tool_call'] -%}
+    {{- '<|turn>model\n' -}}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-3300/optimizer.pt b/checkpoint-3300/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9bf5f3f4d2dfa952cd7a0dd3bb57c95fff55d368
--- /dev/null
+++ b/checkpoint-3300/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc5320188332556e5821be0cd8607d3554c07d98ba5100a705efb88727a70bfd
+size 72807355
diff --git a/checkpoint-3300/processor_config.json b/checkpoint-3300/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-3300/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-3300/rng_state.pth b/checkpoint-3300/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-3300/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-3300/scheduler.pt b/checkpoint-3300/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d54cfdf5d38e1b5df758ecefaf3ecd3487f9c
--- /dev/null
+++ b/checkpoint-3300/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:061e9564ca1cd5e50b2a0842dab0cf4dbfb6db44af39595b8259d6c65c843209
+size 1465
diff --git a/checkpoint-3300/tokenizer.json b/checkpoint-3300/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-3300/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-3300/tokenizer_config.json b/checkpoint-3300/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-3300/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-3300/trainer_state.json b/checkpoint-3300/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1b2d7d31f08c2829ce93a2ac937d93c96a44e155
--- /dev/null
+++ b/checkpoint-3300/trainer_state.json
@@ -0,0 +1,4662 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.6004366812227074,
+  "eval_steps": 100,
+  "global_step": 3300,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.8176324849839572e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-3300/training_args.bin b/checkpoint-3300/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-3300/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-3400/README.md b/checkpoint-3400/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-3400/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information is needed to make further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-3400/adapter_config.json b/checkpoint-3400/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-3400/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-3400/adapter_model.safetensors b/checkpoint-3400/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..81aca3510ccca4664b7b77c69a34f1bed6ac6a89
--- /dev/null
+++ b/checkpoint-3400/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f84d5e1dbf9adb3564ef0dd5855eda4e578dfb64526eab20ef967dcafa91de8c
+size 169741912
diff --git a/checkpoint-3400/chat_template.jinja b/checkpoint-3400/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-3400/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders every entry of properties, sorted by key, as comma-joined key:{...} groups. Each group may carry description, enum (STRING), items (ARRAY), nullable, nested properties/required (OBJECT) and always ends with the upper-cased type. With filter_keys=true, keys listed in standard_keys are skipped. NOTE(review): the required argument is accepted but never referenced in this macro body. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}{#- found_first turns true after the first rendered entry and drives the ',' between entries -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- add_comma turns true once this entry has emitted a field, so later fields are prefixed with ',' -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}{#- keys of the items schema whose value is none are omitted entirely -#}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- no usable 'properties' mapping: recurse over value itself, skipping the keys in standard_keys -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool definition from tool_data['function'] as declaration:NAME{description:...} with an optional ,parameters:{...} group (properties delegated to format_parameters, then required, then type) and an optional ,response:{...} group. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the closing brace of the parameters group is emitted only inside this branch; confirm params always carries a type -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): the closing brace of the response group is emitted only when the type is OBJECT -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serialises one value recursively. Branches are tested in this order: string (wrapped in the <|"|> delimiter), boolean (true/false), mapping ({key:value,...} in key-sorted order), sequence ([a,b,...]), anything else (emitted as-is). escape_keys controls whether mapping keys are also wrapped in the delimiter and is passed down unchanged. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}{#- found_first drives the ',' separator between key:value pairs -#}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Returns text with thinking-channel spans removed: text is split on '<channel|>', each piece that contains '<|channel>' keeps only what precedes that marker, the pieces are concatenated and the result is trimmed. -#}
+    {%- set ns = namespace(result='') -%}{#- namespace is used so the accumulated string survives across loop iterations -#}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emits one tool result between the '<|tool_response>' and '<tool_response|>' markers. A mapping response becomes response:NAME{key:value,...} with keys sorted and left unwrapped; any other response becomes response:NAME{value:...}. Values are serialised by format_argument with escape_keys=False. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type records the kind of segment emitted most recently; this block sets it to 'think' and 'tool' -#}
+{%- set loop_messages = messages -%}{#- default: iterate every message; narrowed below when message 0 is a system/developer message -#}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block: opened when thinking is enabled, tools are supplied, or the first message is a system/developer message -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}{#- NOTE(review): assumes every content part has a 'text' key; each part is trimmed and followed by one space -#}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the system/developer message was rendered above, so it is dropped from the main loop -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard (an index into loop_messages; stays -1 when no message has role 'user') -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Render every conversation message. Messages with role 'tool' are skipped by this loop; their payload is emitted by the forward scan of the assistant message that issued the tool calls -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: walk backwards to the nearest non-tool message; if it was also 'assistant', suppress a duplicate <|turn>model header -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as a thinking channel, but only for tool-calling messages that come after the last user message -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+            {#- Tool calls: each is written as <|tool_call>call:NAME{...}<tool_call|>; mapping arguments become key-sorted key:value pairs, string arguments are copied verbatim -#}
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+            {#- Tool responses: ns_tr_out.flag records whether at least one response block was emitted for this message; it drives the turn-closing decision at the end of the loop body -#}
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan the consecutive role:tool messages that follow; the scan stops at the first non-tool message -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name. get() yields None for a missing/null 'name' and default() only replaces undefined values unless its boolean flag is true, hence default('unknown', true). NOTE(review): when neither side carries an id, None == None matches every call and the last call's name wins -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array; for arrays only 'text' parts are concatenated, and a missing/null 'text' counts as an empty string -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {#- Message content is rendered into captured_content first so that its emptiness can be tested afterwards; model text passes through strip_thinking, other roles are trimmed, media parts become placeholder tokens -#}
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+        {#- Turn closing: tool calls with no response yet end on an opening <|tool_response>; responses without trailing content leave the turn open; every other case closes the turn -#}
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Generation prompt: open a fresh model turn when requested, except when the rendered -#}
+{#- conversation ended on a tool call or tool response (ns.prev_message_type says so). -#}
+{%- if add_generation_prompt and ns.prev_message_type not in ['tool_response', 'tool_call'] -%}
+    {{- '<|turn>model\n' -}}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-3400/optimizer.pt b/checkpoint-3400/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6e2921c4e4fc064a5fbd73bd211b4b60fd103991
--- /dev/null
+++ b/checkpoint-3400/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d2f3734b61c4f008d3375cf2af6c68eb9386d2a6832d9482b0ca22ab76e59a2
+size 72807355
diff --git a/checkpoint-3400/processor_config.json b/checkpoint-3400/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-3400/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-3400/rng_state.pth b/checkpoint-3400/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-3400/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-3400/scheduler.pt b/checkpoint-3400/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4398756d480b21edbcb76e68de5eb5295c55bcd2
--- /dev/null
+++ b/checkpoint-3400/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce5a00f8ff3c6602d68364babff15c75ed86345b24f966653e3d9d6e6f07629d
+size 1465
diff --git a/checkpoint-3400/tokenizer.json b/checkpoint-3400/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-3400/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-3400/tokenizer_config.json b/checkpoint-3400/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-3400/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-3400/trainer_state.json b/checkpoint-3400/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..9113ed7d6c19bdf00eaededee5566c5d1d740ed1
--- /dev/null
+++ b/checkpoint-3400/trainer_state.json
@@ -0,0 +1,4802 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.61863173216885,
+  "eval_steps": 100,
+  "global_step": 3400,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.8722872678405663e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-3400/training_args.bin b/checkpoint-3400/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-3400/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-3500/README.md b/checkpoint-3500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-3500/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-3500/adapter_config.json b/checkpoint-3500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-3500/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-3500/adapter_model.safetensors b/checkpoint-3500/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4872acf012f516316068a397a3e22f1a9523bc3c
--- /dev/null
+++ b/checkpoint-3500/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:384574f4d1031e95902d07379d0ca65cef78f2a497ce8f88e0b5fb2abd9befc7
+size 169741912
diff --git a/checkpoint-3500/chat_template.jinja b/checkpoint-3500/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-3500/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%} {#- Renders every entry of `properties`, sorted by key, as a comma-separated list of key/schema objects. `required` is accepted but never read in this body. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} {#- Entries with these keys are skipped when filter_keys is true. -#}
+    {%- set ns = namespace(found_first=false) -%} {#- Namespace so the flag survives loop iterations; set once the first entry is written. -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%} {#- Per-entry flag: true once any field of this entry has been emitted. -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%} {#- enum is only emitted for STRING-typed entries. -#}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%} {#- ARRAY entries render their non-empty `items` mapping field by field. -#}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%} {#- Fields whose value is none are omitted entirely. -#}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%} {#- Non-string type: each element is upper-cased and the list is rendered. -#}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%} {#- No usable `properties` mapping: recurse over the entry itself, skipping standard_keys. -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%} {#- Renders one tool as a declaration block: name, description, then optional parameters and response sections read from tool_data['function']. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%} {#- NOTE(review): the brace closing the parameters section is only written inside this branch, so a falsy params['type'] leaves it unclosed - confirm this is intended. -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%} {#- NOTE(review): the brace closing the response section is only written when the type is OBJECT - confirm other types cannot occur. -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%} {#- Recursively serialises a value: strings are wrapped in the <|"|> delimiter, booleans become true/false, mappings and sequences are rendered element by element. -#}
+    {%- if argument is string -%} {#- Tested first, ahead of the mapping and sequence branches below. -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%} {#- escape_keys controls whether mapping keys are wrapped in the string delimiter or written bare. -#}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%} {#- Anything else is emitted through the default string conversion. -#}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%} {#- Returns `text` with every span from a <|channel> marker up to the following <channel|> marker removed, then trimmed. -#}
+    {%- set ns = namespace(result='') -%} {#- Namespace so the accumulated string survives loop iterations. -#}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%} {#- Keep only the text that precedes the opening marker in this part. -#}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%} {#- Emits one tool-response block for `tool_name`; `tool_name` is concatenated with string literals, so it must be a string. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%} {#- Mapping responses are written as key-sorted key:value pairs with bare keys. -#}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%} {#- Any non-mapping response is wrapped under a single `value` key. -#}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%} {#- Records the kind of element rendered last; read again when deciding how to close a turn and whether to emit the generation prompt. -#}
+{%- set loop_messages = messages -%} {#- Replaced by messages[1:] below when the first message is rendered into the system turn. -#}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%} {#- Content-parts form: each part's text is trimmed and followed by a single space. -#}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%} {#- Stays -1 when loop_messages holds no user message. -#}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages -#}
+{%- for message in loop_messages -%} {#- Renders every non-tool message as a turn; role 'tool' messages are not rendered on their own but are consumed by the forward scan of the assistant message that issued the calls. -#}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} {#- Reasoning is only replayed for tool-calling messages that come after the last user message. -#}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%} {#- Pre-serialised arguments are inserted verbatim between the braces. -#}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%} {#- Set once at least one tool response has been written for this message. -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown', true), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%} {#- Boolean default: .get() yields none rather than undefined, and the name is later concatenated as a string. -#}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%} {#- Boolean default so a none text part concatenates as an empty string. -#}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} {#- Tool calls with no response yet: leave the turn open on a tool-response marker. -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%} {#- The turn is closed unless responses were written and no content followed them. -#}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} {#- No new model turn is opened when the last rendered element was a tool call or tool response. -#}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-3500/optimizer.pt b/checkpoint-3500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ce30efb9345abceba0680da344fdfd0ad13520c1
--- /dev/null
+++ b/checkpoint-3500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4cceea90703d2831388e6837b34fe0ea3e8c37f98beaec341ba630ec47a73b69
+size 72807355
diff --git a/checkpoint-3500/processor_config.json b/checkpoint-3500/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-3500/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-3500/rng_state.pth b/checkpoint-3500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-3500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-3500/scheduler.pt b/checkpoint-3500/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4f2c71216c33756027cf1bc9233b471b10082886
--- /dev/null
+++ b/checkpoint-3500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e7f5c829273d81c7d394275a2bb332278dfb03715ee3b6e4102dba4691e9d37
+size 1465
diff --git a/checkpoint-3500/tokenizer.json b/checkpoint-3500/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-3500/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-3500/tokenizer_config.json b/checkpoint-3500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-3500/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-3500/trainer_state.json b/checkpoint-3500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1aef63ff4eaed2165e38c9eb070351d0a2896ad1
--- /dev/null
+++ b/checkpoint-3500/trainer_state.json
@@ -0,0 +1,4942 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.6368267831149927,
+  "eval_steps": 100,
+  "global_step": 3500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.9271465327291443e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-3500/training_args.bin b/checkpoint-3500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-3500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-3600/README.md b/checkpoint-3600/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-3600/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for text generation
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** unsloth/gemma-4-E4B-it
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-3600/adapter_config.json b/checkpoint-3600/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-3600/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-3600/adapter_model.safetensors b/checkpoint-3600/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0672b63f57edd1965d6e90f60090756fe5f3a5ea
--- /dev/null
+++ b/checkpoint-3600/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e82f4db15171359b08051df7ef22b2a91759466d62f59e96fcc832280a756330
+size 169741912
diff --git a/checkpoint-3600/chat_template.jinja b/checkpoint-3600/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-3600/chat_template.jinja
@@ -0,0 +1,351 @@
+{#- format_parameters(properties, required, filter_keys=false)
+    Renders a mapping of parameter schemas as comma-separated `key:{...}` entries,
+    visiting keys in sorted order (dictsort).
+    For each entry the fields are emitted in this fixed order, each only when present:
+    description, enum (STRING types), items (ARRAY types), nullable,
+    properties and required (OBJECT types), and finally type, which is always emitted
+    and closes the entry's brace.
+    When filter_keys is true, keys listed in standard_keys are skipped.
+    NOTE(review): the `required` argument is not referenced anywhere in this macro body;
+    required lists are read from value['required'] instead.
+-#}
+{%- macro format_parameters(properties, required, filter_keys=false) -%}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}
+        {#- add_comma becomes true once any field of this entry has been written, so later fields are prefixed with ',' -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {#- type is always written last; its trailing '}' closes the brace opened after the key -#}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{#- format_function_declaration(tool_data)
+    Renders one tool definition as
+    `declaration:NAME{description:<|"|>...<|"|>[,parameters:{...}][,response:{...}]}`.
+    Reads tool_data['function'] for name, description, parameters and (optionally) response.
+    NOTE(review): the '}' that closes `parameters:{` is only written inside the
+    params['type'] branch, and the '}' that closes `response:{` only inside the
+    OBJECT-type branch; confirm callers always supply those types.
+-#}
+{%- macro format_function_declaration(tool_data) -%}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{#- format_argument(argument, escape_keys=True)
+    Recursively serializes a value:
+      string   -> wrapped in <|"|> ... <|"|>
+      boolean  -> true / false
+      mapping  -> {key:value,...} with keys in sorted order; keys are wrapped in
+                  <|"|> only when escape_keys is true
+      sequence -> [item,item,...]
+      other    -> rendered as-is
+    The string test comes before the sequence test so strings are not expanded item by item.
+-#}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{#- strip_thinking(text)
+    Removes `<|channel> ... <channel|>` spans from text.
+    The text is split on the closing marker '<channel|>'; for every piece that contains
+    the opening marker '<|channel>' only the portion before the first opening marker is
+    kept, other pieces are kept whole. The kept pieces are concatenated and trimmed.
+-#}
+{%- macro strip_thinking(text) -%}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{#- format_tool_response_block(tool_name, response)
+    Emits `<|tool_response>response:NAME{...}<tool_response|>`.
+    A mapping response is rendered as key:value pairs in sorted key order;
+    any other response is rendered as a single `value:` entry.
+    Values go through format_argument with escape_keys=False.
+    tool_name is concatenated with '+', so it must be a string.
+-#}
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{#- Template state:
+    ns.prev_message_type records the kind of the most recently emitted element
+    ('think', 'tool', 'tool_call', 'tool_response', 'image', 'audio', 'video' or None).
+    loop_messages is the message list to render as turns; it drops messages[0] when that
+    message is a system/developer message, because it is rendered in the system turn below.
+-#}
+{%- set ns = namespace(prev_message_type=None) -%}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block -#}
+{#- A system turn is opened when thinking is enabled, tools are supplied, or the first message is system/developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {#- String content is trimmed; list content emits each item's trimmed 'text' followed by a space -#}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard -#}
+{#- last_user_idx stays -1 when loop_messages contains no user message -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages -#}
+{#- Renders every non-tool message as a turn. role:tool messages are never rendered on
+    their own; they are consumed by the forward scan of the assistant message that issued
+    the matching tool_calls. Per message the order is: optional turn header, optional
+    thinking channel, tool calls, tool responses, text/media content, turn terminator. -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {#- Walk backwards to the nearest earlier message whose role is not 'tool' -#}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {#- Only for messages after the last user message that also carry tool_calls -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {#- ns_tr_out.flag records whether at least one tool response was rendered for this message -#}
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name -#}
+                        {#- follow.get('name') yields none (not undefined) when the key is absent, so
+                            default needs its boolean flag to fall back to 'unknown'; otherwise none
+                            would reach the string concatenation in format_tool_response_block -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {#- Content is captured first so the terminator logic below can test whether it is empty -#}
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {#- Terminator: tool calls with no rendered response leave an opening <|tool_response>;
+            otherwise the turn is closed unless responses were rendered and there is no content -#}
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Generation prompt: open a new model turn unless the last emitted element was a tool
+    call or tool response, in which case no new turn header is written -#}
+{%- if add_generation_prompt -%}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-3600/optimizer.pt b/checkpoint-3600/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5da9402cf51e94b26a2ea76746e2f43e3c2a7173
--- /dev/null
+++ b/checkpoint-3600/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:81f9806af70ac150ea697e8bf39f88750fb40b1e6ca60c895897b99d1e88c451
+size 72807355
diff --git a/checkpoint-3600/processor_config.json b/checkpoint-3600/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-3600/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-3600/rng_state.pth b/checkpoint-3600/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-3600/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-3600/scheduler.pt b/checkpoint-3600/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2f95e80419dd9da35cffe755c298d5e917e06737
--- /dev/null
+++ b/checkpoint-3600/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8381d34f5354aa03b079a9399bfed585738434f75d919dd720618a74b99a1247
+size 1465
diff --git a/checkpoint-3600/tokenizer.json b/checkpoint-3600/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-3600/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-3600/tokenizer_config.json b/checkpoint-3600/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-3600/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-3600/trainer_state.json b/checkpoint-3600/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..52d156c0fb6c0eb1fd3c734e8f74e68af254f1f8
--- /dev/null
+++ b/checkpoint-3600/trainer_state.json
@@ -0,0 +1,5082 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.6550218340611353,
+  "eval_steps": 100,
+  "global_step": 3600,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6377365356622998,
+      "grad_norm": 0.17014239728450775,
+      "learning_rate": 1.5370045130657366e-05,
+      "loss": 0.14694437980651856,
+      "step": 3505
+    },
+    {
+      "epoch": 0.638646288209607,
+      "grad_norm": 0.14744038879871368,
+      "learning_rate": 1.5302158945041838e-05,
+      "loss": 0.14434736967086792,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6395560407569141,
+      "grad_norm": 0.2069770246744156,
+      "learning_rate": 1.523435683031818e-05,
+      "loss": 0.13982917070388795,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6404657933042213,
+      "grad_norm": 0.17811502516269684,
+      "learning_rate": 1.5166639374265063e-05,
+      "loss": 0.1408839702606201,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6413755458515283,
+      "grad_norm": 0.165786474943161,
+      "learning_rate": 1.509900716392728e-05,
+      "loss": 0.15312877893447877,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6422852983988355,
+      "grad_norm": 0.1633884161710739,
+      "learning_rate": 1.5031460785610596e-05,
+      "loss": 0.1488795518875122,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6431950509461426,
+      "grad_norm": 0.16498984396457672,
+      "learning_rate": 1.4964000824876723e-05,
+      "loss": 0.15031465291976928,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6441048034934498,
+      "grad_norm": 0.18043678998947144,
+      "learning_rate": 1.4896627866538191e-05,
+      "loss": 0.147829806804657,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6450145560407569,
+      "grad_norm": 0.16813597083091736,
+      "learning_rate": 1.4829342494653315e-05,
+      "loss": 0.1418998956680298,
+      "step": 3545
+    },
+    {
+      "epoch": 0.645924308588064,
+      "grad_norm": 0.1817242056131363,
+      "learning_rate": 1.4762145292521118e-05,
+      "loss": 0.14508869647979736,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6468340611353712,
+      "grad_norm": 0.14666494727134705,
+      "learning_rate": 1.469503684267628e-05,
+      "loss": 0.14159854650497436,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6477438136826783,
+      "grad_norm": 0.16485381126403809,
+      "learning_rate": 1.4628017726884086e-05,
+      "loss": 0.14419105052947997,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6486535662299855,
+      "grad_norm": 0.16100342571735382,
+      "learning_rate": 1.4561088526135375e-05,
+      "loss": 0.14501721858978273,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6495633187772926,
+      "grad_norm": 0.16996590793132782,
+      "learning_rate": 1.4494249820641493e-05,
+      "loss": 0.1377166509628296,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6504730713245997,
+      "grad_norm": 0.16168837249279022,
+      "learning_rate": 1.4427502189829339e-05,
+      "loss": 0.1414325475692749,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6513828238719068,
+      "grad_norm": 0.16318906843662262,
+      "learning_rate": 1.436084621233621e-05,
+      "loss": 0.14685193300247193,
+      "step": 3580
+    },
+    {
+      "epoch": 0.652292576419214,
+      "grad_norm": 0.1636219322681427,
+      "learning_rate": 1.4294282466004899e-05,
+      "loss": 0.1405899167060852,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6532023289665211,
+      "grad_norm": 0.1838461309671402,
+      "learning_rate": 1.422781152787865e-05,
+      "loss": 0.14386332035064697,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6541120815138283,
+      "grad_norm": 0.1796344667673111,
+      "learning_rate": 1.4161433974196115e-05,
+      "loss": 0.1513024687767029,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6550218340611353,
+      "grad_norm": 0.16424529254436493,
+      "learning_rate": 1.4095150380386427e-05,
+      "loss": 0.14238927364349366,
+      "step": 3600
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.981864745409872e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-3600/training_args.bin b/checkpoint-3600/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-3600/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-3700/README.md b/checkpoint-3700/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-3700/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-3700/adapter_config.json b/checkpoint-3700/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-3700/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-3700/adapter_model.safetensors b/checkpoint-3700/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f30c6293c051d936a00e51a0a8df81a652d772f6
--- /dev/null
+++ b/checkpoint-3700/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da845288daa68c0affac00efd07f114f9dcc78e018847d986c15c1f79ec4d423
+size 169741912
diff --git a/checkpoint-3700/chat_template.jinja b/checkpoint-3700/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-3700/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Render every entry of properties, visited in key order via dictsort, as its key followed by a brace-wrapped, comma-separated field list. The required argument is accepted but never read inside this macro body. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}{#- Keys skipped as entries when filter_keys is true. -#}
+    {%- set ns = namespace(found_first=false) -%}{#- Set once the first entry has been written, so later entries are preceded by a comma. -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- Tracks whether a field has already been written inside the current entry. -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}{#- STRING entries may carry an enum list. -#}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}{#- ARRAY entries describe their element schema under items. -#}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- OBJECT without a properties mapping: recurse over the value itself, skipping the standard keys. -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Render one tool definition: the function name and description, then the optional parameters schema and the optional response schema. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the brace that closes the parameters section is written only inside this branch, so a schema without a type leaves it unclosed; confirm this is intended. -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): likewise, the brace that closes the response section is written only when the type is OBJECT; confirm this is intended. -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {#- Serialise a value for tool declarations, calls and responses. -#}
+    {#- Strings are wrapped in the escape token and booleans become true/false. -#}
+    {#- Mappings are written in sorted key order inside braces, with keys wrapped -#}
+    {#- in the escape token only when escape_keys is truthy; sequences are written -#}
+    {#- inside square brackets. Any other value is printed unchanged. -#}
+    {%- set quote = '<|"|>' -%}
+    {%- if argument is string -%}
+        {{- quote + argument + quote -}}
+    {%- elif argument is boolean -%}
+        {%- if argument -%}{{- 'true' -}}{%- else -%}{{- 'false' -}}{%- endif -%}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- for entry_key, entry_value in argument | dictsort -%}
+            {%- if not loop.first %},{% endif -%}
+            {{- (quote + entry_key + quote) if escape_keys else entry_key -}}
+            :{{- format_argument(entry_value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for element in argument -%}
+            {%- if not loop.first %},{% endif -%}
+            {{- format_argument(element, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}
+    {#- Remove thinking-channel spans from model text. The text is cut at every -#}
+    {#- channel-close marker; within each piece only what precedes the first -#}
+    {#- channel-open marker is kept. The joined remainder is trimmed. -#}
+    {%- set kept = namespace(out='') -%}
+    {%- for segment in text.split('<channel|>') -%}
+        {%- set visible = segment.split('<|channel>')[0] if '<|channel>' in segment else segment -%}
+        {%- set kept.out = kept.out + visible -%}
+    {%- endfor -%}
+    {{- kept.out | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {#- Wrap one tool result in the tool-response markers as response:NAME plus a braced body. -#}
+    {#- Mapping results list their fields in sorted key order; any other result is written as a single value field. -#}
+    {{- '<|tool_response>' + 'response:' + tool_name + '{' -}}
+    {%- if response is not mapping -%}
+        {{- 'value:' + format_argument(response, escape_keys=False) -}}
+    {%- else -%}
+        {%- for field, payload in response | dictsort -%}
+            {%- if not loop.first %},{% endif -%}
+            {{- field -}}:{{- format_argument(payload, escape_keys=False) -}}
+        {%- endfor -%}
+    {%- endif -%}
+    {{- '}' + '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type records the kind of the most recently rendered element; it is read again when each turn is closed and when the generation prompt is decided. -#}
+{%- set loop_messages = messages -%}{#- Messages to iterate over; the leading system/developer message is dropped below once it has been rendered. -#}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block: a system turn is opened when thinking is enabled, tools are supplied, or the first message has the system or developer role -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}{#- Content-parts list: the text of each part is trimmed and followed by a single space. -#}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {%- if tools -%}{#- Each tool declaration is wrapped in its own pair of tool markers. -#}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: remember the position of the final user message in loop_messages (-1 when there is none) for the reasoning guard -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for candidate in loop_messages -%}
+    {%- if candidate['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = loop.index0 -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages, rendering every non-tool message as a turn or as the continuation of one -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}{#- Messages with role tool are not rendered on their own; the forward scan further down writes them right after the assistant message whose tool_calls they follow. -#}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}{#- Reasoning is replayed only for messages that come after the last user message and also carry tool_calls. -#}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}{#- String arguments are written verbatim between the braces. -#}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag becomes true once at least one tool response has been written for this message. -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}{#- Content-parts list: only parts of type text are concatenated into the response body. -#}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}{#- The rendered content is captured so that it can be both emitted and tested for emptiness below. -#}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- A tool call with no response written leaves the turn open with a tool-response opener; otherwise the turn is closed unless tool responses were written and no content followed them. -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Generation prompt: open a new model turn when requested, except when the -#}
+{#- most recently rendered element was a tool call or a tool response. -#}
+{%- if add_generation_prompt and ns.prev_message_type not in ['tool_response', 'tool_call'] -%}
+    {{- '<|turn>model\n' -}}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-3700/optimizer.pt b/checkpoint-3700/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ea675ae4e682ce925f7efbb4471f23b91a233a2b
--- /dev/null
+++ b/checkpoint-3700/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a49556e185caef228272d3873c161ac51fb9fe4e142db4b7ca66d31e5543e911
+size 72807355
diff --git a/checkpoint-3700/processor_config.json b/checkpoint-3700/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-3700/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-3700/rng_state.pth b/checkpoint-3700/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-3700/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-3700/scheduler.pt b/checkpoint-3700/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b2caef3ec140126df6ed83d8003b196d0eacd08e
--- /dev/null
+++ b/checkpoint-3700/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8e165369a022f727a5dc2e033515cbe349803d48dbbc9acf35607a611f0cf526
+size 1465
diff --git a/checkpoint-3700/tokenizer.json b/checkpoint-3700/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-3700/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-3700/tokenizer_config.json b/checkpoint-3700/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-3700/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-3700/trainer_state.json b/checkpoint-3700/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..79d1a75319eca95c2e51a702ceff468cde6d277a
--- /dev/null
+++ b/checkpoint-3700/trainer_state.json
@@ -0,0 +1,5222 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.673216885007278,
+  "eval_steps": 100,
+  "global_step": 3700,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6377365356622998,
+      "grad_norm": 0.17014239728450775,
+      "learning_rate": 1.5370045130657366e-05,
+      "loss": 0.14694437980651856,
+      "step": 3505
+    },
+    {
+      "epoch": 0.638646288209607,
+      "grad_norm": 0.14744038879871368,
+      "learning_rate": 1.5302158945041838e-05,
+      "loss": 0.14434736967086792,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6395560407569141,
+      "grad_norm": 0.2069770246744156,
+      "learning_rate": 1.523435683031818e-05,
+      "loss": 0.13982917070388795,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6404657933042213,
+      "grad_norm": 0.17811502516269684,
+      "learning_rate": 1.5166639374265063e-05,
+      "loss": 0.1408839702606201,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6413755458515283,
+      "grad_norm": 0.165786474943161,
+      "learning_rate": 1.509900716392728e-05,
+      "loss": 0.15312877893447877,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6422852983988355,
+      "grad_norm": 0.1633884161710739,
+      "learning_rate": 1.5031460785610596e-05,
+      "loss": 0.1488795518875122,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6431950509461426,
+      "grad_norm": 0.16498984396457672,
+      "learning_rate": 1.4964000824876723e-05,
+      "loss": 0.15031465291976928,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6441048034934498,
+      "grad_norm": 0.18043678998947144,
+      "learning_rate": 1.4896627866538191e-05,
+      "loss": 0.147829806804657,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6450145560407569,
+      "grad_norm": 0.16813597083091736,
+      "learning_rate": 1.4829342494653315e-05,
+      "loss": 0.1418998956680298,
+      "step": 3545
+    },
+    {
+      "epoch": 0.645924308588064,
+      "grad_norm": 0.1817242056131363,
+      "learning_rate": 1.4762145292521118e-05,
+      "loss": 0.14508869647979736,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6468340611353712,
+      "grad_norm": 0.14666494727134705,
+      "learning_rate": 1.469503684267628e-05,
+      "loss": 0.14159854650497436,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6477438136826783,
+      "grad_norm": 0.16485381126403809,
+      "learning_rate": 1.4628017726884086e-05,
+      "loss": 0.14419105052947997,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6486535662299855,
+      "grad_norm": 0.16100342571735382,
+      "learning_rate": 1.4561088526135375e-05,
+      "loss": 0.14501721858978273,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6495633187772926,
+      "grad_norm": 0.16996590793132782,
+      "learning_rate": 1.4494249820641493e-05,
+      "loss": 0.1377166509628296,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6504730713245997,
+      "grad_norm": 0.16168837249279022,
+      "learning_rate": 1.4427502189829339e-05,
+      "loss": 0.1414325475692749,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6513828238719068,
+      "grad_norm": 0.16318906843662262,
+      "learning_rate": 1.436084621233621e-05,
+      "loss": 0.14685193300247193,
+      "step": 3580
+    },
+    {
+      "epoch": 0.652292576419214,
+      "grad_norm": 0.1636219322681427,
+      "learning_rate": 1.4294282466004899e-05,
+      "loss": 0.1405899167060852,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6532023289665211,
+      "grad_norm": 0.1838461309671402,
+      "learning_rate": 1.422781152787865e-05,
+      "loss": 0.14386332035064697,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6541120815138283,
+      "grad_norm": 0.1796344667673111,
+      "learning_rate": 1.4161433974196115e-05,
+      "loss": 0.1513024687767029,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6550218340611353,
+      "grad_norm": 0.16424529254436493,
+      "learning_rate": 1.4095150380386427e-05,
+      "loss": 0.14238927364349366,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6559315866084425,
+      "grad_norm": 0.19264160096645355,
+      "learning_rate": 1.402896132106415e-05,
+      "loss": 0.14297477006912232,
+      "step": 3605
+    },
+    {
+      "epoch": 0.6568413391557496,
+      "grad_norm": 0.18319948017597198,
+      "learning_rate": 1.3962867370024347e-05,
+      "loss": 0.1448880434036255,
+      "step": 3610
+    },
+    {
+      "epoch": 0.6577510917030568,
+      "grad_norm": 0.16507290303707123,
+      "learning_rate": 1.389686910023758e-05,
+      "loss": 0.14724698066711425,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6586608442503639,
+      "grad_norm": 0.17871244251728058,
+      "learning_rate": 1.3830967083844942e-05,
+      "loss": 0.14479386806488037,
+      "step": 3620
+    },
+    {
+      "epoch": 0.659570596797671,
+      "grad_norm": 0.1846228390932083,
+      "learning_rate": 1.3765161892153112e-05,
+      "loss": 0.1453616738319397,
+      "step": 3625
+    },
+    {
+      "epoch": 0.6604803493449781,
+      "grad_norm": 0.17185978591442108,
+      "learning_rate": 1.3699454095629372e-05,
+      "loss": 0.14906206130981445,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6613901018922853,
+      "grad_norm": 0.14751191437244415,
+      "learning_rate": 1.3633844263896698e-05,
+      "loss": 0.13991892337799072,
+      "step": 3635
+    },
+    {
+      "epoch": 0.6622998544395924,
+      "grad_norm": 0.22059763967990875,
+      "learning_rate": 1.3568332965728817e-05,
+      "loss": 0.14680869579315187,
+      "step": 3640
+    },
+    {
+      "epoch": 0.6632096069868996,
+      "grad_norm": 0.15295909345149994,
+      "learning_rate": 1.3502920769045232e-05,
+      "loss": 0.1404443383216858,
+      "step": 3645
+    },
+    {
+      "epoch": 0.6641193595342066,
+      "grad_norm": 0.14600558578968048,
+      "learning_rate": 1.3437608240906364e-05,
+      "loss": 0.14663270711898804,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6650291120815138,
+      "grad_norm": 0.15548352897167206,
+      "learning_rate": 1.3372395947508587e-05,
+      "loss": 0.1431443452835083,
+      "step": 3655
+    },
+    {
+      "epoch": 0.665938864628821,
+      "grad_norm": 0.1813388466835022,
+      "learning_rate": 1.3307284454179342e-05,
+      "loss": 0.1458706736564636,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6668486171761281,
+      "grad_norm": 0.16326870024204254,
+      "learning_rate": 1.3242274325372247e-05,
+      "loss": 0.14700595140457154,
+      "step": 3665
+    },
+    {
+      "epoch": 0.6677583697234353,
+      "grad_norm": 0.18779197335243225,
+      "learning_rate": 1.3177366124662149e-05,
+      "loss": 0.1497237801551819,
+      "step": 3670
+    },
+    {
+      "epoch": 0.6686681222707423,
+      "grad_norm": 0.16291002929210663,
+      "learning_rate": 1.3112560414740315e-05,
+      "loss": 0.1387086868286133,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6695778748180495,
+      "grad_norm": 0.1532297134399414,
+      "learning_rate": 1.3047857757409487e-05,
+      "loss": 0.14497545957565308,
+      "step": 3680
+    },
+    {
+      "epoch": 0.6704876273653566,
+      "grad_norm": 0.14697515964508057,
+      "learning_rate": 1.2983258713579066e-05,
+      "loss": 0.1494283437728882,
+      "step": 3685
+    },
+    {
+      "epoch": 0.6713973799126638,
+      "grad_norm": 0.15213452279567719,
+      "learning_rate": 1.2918763843260218e-05,
+      "loss": 0.1468907594680786,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6723071324599709,
+      "grad_norm": 0.1745215803384781,
+      "learning_rate": 1.285437370556099e-05,
+      "loss": 0.14997754096984864,
+      "step": 3695
+    },
+    {
+      "epoch": 0.673216885007278,
+      "grad_norm": 0.19207637012004852,
+      "learning_rate": 1.2790088858681577e-05,
+      "loss": 0.14202862977981567,
+      "step": 3700
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.0364761863728922e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-3700/training_args.bin b/checkpoint-3700/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-3700/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-3800/README.md b/checkpoint-3800/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-3800/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-3800/adapter_config.json b/checkpoint-3800/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-3800/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-3800/adapter_model.safetensors b/checkpoint-3800/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..fd326176a9cbc79c72c49b2842b8cbc4e18bf908
--- /dev/null
+++ b/checkpoint-3800/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a3f03ada8e1ed15490b9cb47aeb536f3bb198392f1bd7a70f78da113ba68092
+size 169741912
diff --git a/checkpoint-3800/chat_template.jinja b/checkpoint-3800/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-3800/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders each entry of properties (sorted by key) as a comma-separated key:{...} item. NOTE(review): the required argument is never read inside this macro body. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}{#- Keys that are skipped when filter_keys is true. -#}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- Set to true once the current entry has written a field, so that later fields are prefixed with a comma. -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- OBJECT without a properties mapping: the value's own keys, minus standard_keys, are rendered as its properties. -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- The type field is always written last; its trailing brace closes the key:{ opened above. -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool as declaration:NAME{description:...,parameters:{...},response:{...}} from tool_data['function']. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the brace that closes parameters:{ is only written in this branch, so parameters without a type leave it unclosed - confirm this is intended. -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): the brace that closes response:{ is only written when the type is OBJECT - confirm other types cannot occur. -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {#- Serializes a value: strings are wrapped in the quote token, booleans become true/false, mappings become a brace list sorted by key, sequences become a bracket list, and anything else is printed as-is. -#}
+    {#- escape_keys decides whether mapping keys are also wrapped in the quote token; it is passed unchanged to nested values. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {%- if argument -%}true{%- else -%}false{%- endif -%}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- for entry_key, entry_value in argument | dictsort -%}
+            {%- if not loop.first %},{% endif -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + entry_key + '<|"|>' -}}
+            {%- else -%}
+                {{- entry_key -}}
+            {%- endif -%}
+            {{- ':' -}}{{- format_argument(entry_value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for element in argument -%}
+            {%- if not loop.first %},{% endif -%}
+            {{- format_argument(element, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}
+    {#- Removes thinking-channel spans from text and trims the result. -#}
+    {#- The text is cut at every closing channel marker; from each piece only the part -#}
+    {#- before its first opening channel marker is kept (the whole piece if it has none). -#}
+    {%- set kept = namespace(text='') -%}
+    {%- for segment in text.split('<channel|>') -%}
+        {%- set visible = segment.split('<|channel>')[0] -%}
+        {%- set kept.text = kept.text + visible -%}
+    {%- endfor -%}
+    {{- kept.text | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {#- Wraps one tool result between the tool_response start and end tokens, in the form response:NAME followed by a brace list. -#}
+    {#- A mapping result lists its fields sorted by key; any other result is written as a single value field. -#}
+    {{- '<|tool_response>' + 'response:' + tool_name + '{' -}}
+    {%- if response is mapping -%}
+        {%- for field, field_value in response | dictsort -%}
+            {%- if not loop.first %},{% endif -%}
+            {{- field -}}{{- ':' -}}{{- format_argument(field_value, escape_keys=False) -}}
+        {%- endfor -%}
+    {%- else -%}
+        {{- 'value:' + format_argument(response, escape_keys=False) -}}
+    {%- endif -%}
+    {{- '}' + '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type records the kind of the last special item rendered; it is read again when deciding on the generation prompt. -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Emit a leading system turn when thinking is enabled, tools are supplied, or the first message has role system/developer. -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- The thinking token, when enabled, is written first inside this system turn. -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}{#- NOTE(review): every content part is read through item['text']; presumably only text parts are expected here - confirm. -#}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- The first message was rendered here, so the main loop starts after it. -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Record the position of the final user message in loop_messages (-1 when there is none); the message loop's reasoning check compares against it. -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for candidate in loop_messages -%}
+    {%- if candidate['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = loop.index0 -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Main loop: renders every non-tool message as a turn. role:tool messages are not rendered on their own; the forward scan below renders them under the assistant message that carries tool_calls. -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as a thinking channel, only for messages after the last user message that also carry tool_calls -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name. The fallback uses default(..., true) because get() yields None for a missing key, which a plain default() would not replace. -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- Turn ending: a tool call without any response ends on an opening tool_response token; tool responses without text content write no end-of-turn token; every other case closes the turn. -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- When a generation prompt is requested, open a model turn -#}
+{#- unless ns.prev_message_type is 'tool_response' or 'tool_call'. -#}
+{%- if add_generation_prompt and ns.prev_message_type not in ['tool_response', 'tool_call'] -%}
+    {{- '<|turn>model\n' -}}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-3800/optimizer.pt b/checkpoint-3800/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..745b657cab9fd348ccd3ebb0bd5903c4ef06ee9d
--- /dev/null
+++ b/checkpoint-3800/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4748594c024b04d6c6b59db5553f34bddad24d2fd2dc7fd1a33325f745c5fc20
+size 72807355
diff --git a/checkpoint-3800/processor_config.json b/checkpoint-3800/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-3800/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-3800/rng_state.pth b/checkpoint-3800/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-3800/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-3800/scheduler.pt b/checkpoint-3800/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..dab54530d938cf24c40cb74d9e9d029117f1ef7a
--- /dev/null
+++ b/checkpoint-3800/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d0b9d2db22198f8653b00ec482cfbb9290754de8025d23224966c69c5a07bc9
+size 1465
diff --git a/checkpoint-3800/tokenizer.json b/checkpoint-3800/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-3800/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-3800/tokenizer_config.json b/checkpoint-3800/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-3800/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-3800/trainer_state.json b/checkpoint-3800/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..5276e96084aef72b5e015d7228036587bd08a926
--- /dev/null
+++ b/checkpoint-3800/trainer_state.json
@@ -0,0 +1,5362 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.6914119359534207,
+  "eval_steps": 100,
+  "global_step": 3800,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6377365356622998,
+      "grad_norm": 0.17014239728450775,
+      "learning_rate": 1.5370045130657366e-05,
+      "loss": 0.14694437980651856,
+      "step": 3505
+    },
+    {
+      "epoch": 0.638646288209607,
+      "grad_norm": 0.14744038879871368,
+      "learning_rate": 1.5302158945041838e-05,
+      "loss": 0.14434736967086792,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6395560407569141,
+      "grad_norm": 0.2069770246744156,
+      "learning_rate": 1.523435683031818e-05,
+      "loss": 0.13982917070388795,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6404657933042213,
+      "grad_norm": 0.17811502516269684,
+      "learning_rate": 1.5166639374265063e-05,
+      "loss": 0.1408839702606201,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6413755458515283,
+      "grad_norm": 0.165786474943161,
+      "learning_rate": 1.509900716392728e-05,
+      "loss": 0.15312877893447877,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6422852983988355,
+      "grad_norm": 0.1633884161710739,
+      "learning_rate": 1.5031460785610596e-05,
+      "loss": 0.1488795518875122,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6431950509461426,
+      "grad_norm": 0.16498984396457672,
+      "learning_rate": 1.4964000824876723e-05,
+      "loss": 0.15031465291976928,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6441048034934498,
+      "grad_norm": 0.18043678998947144,
+      "learning_rate": 1.4896627866538191e-05,
+      "loss": 0.147829806804657,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6450145560407569,
+      "grad_norm": 0.16813597083091736,
+      "learning_rate": 1.4829342494653315e-05,
+      "loss": 0.1418998956680298,
+      "step": 3545
+    },
+    {
+      "epoch": 0.645924308588064,
+      "grad_norm": 0.1817242056131363,
+      "learning_rate": 1.4762145292521118e-05,
+      "loss": 0.14508869647979736,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6468340611353712,
+      "grad_norm": 0.14666494727134705,
+      "learning_rate": 1.469503684267628e-05,
+      "loss": 0.14159854650497436,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6477438136826783,
+      "grad_norm": 0.16485381126403809,
+      "learning_rate": 1.4628017726884086e-05,
+      "loss": 0.14419105052947997,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6486535662299855,
+      "grad_norm": 0.16100342571735382,
+      "learning_rate": 1.4561088526135375e-05,
+      "loss": 0.14501721858978273,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6495633187772926,
+      "grad_norm": 0.16996590793132782,
+      "learning_rate": 1.4494249820641493e-05,
+      "loss": 0.1377166509628296,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6504730713245997,
+      "grad_norm": 0.16168837249279022,
+      "learning_rate": 1.4427502189829339e-05,
+      "loss": 0.1414325475692749,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6513828238719068,
+      "grad_norm": 0.16318906843662262,
+      "learning_rate": 1.436084621233621e-05,
+      "loss": 0.14685193300247193,
+      "step": 3580
+    },
+    {
+      "epoch": 0.652292576419214,
+      "grad_norm": 0.1636219322681427,
+      "learning_rate": 1.4294282466004899e-05,
+      "loss": 0.1405899167060852,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6532023289665211,
+      "grad_norm": 0.1838461309671402,
+      "learning_rate": 1.422781152787865e-05,
+      "loss": 0.14386332035064697,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6541120815138283,
+      "grad_norm": 0.1796344667673111,
+      "learning_rate": 1.4161433974196115e-05,
+      "loss": 0.1513024687767029,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6550218340611353,
+      "grad_norm": 0.16424529254436493,
+      "learning_rate": 1.4095150380386427e-05,
+      "loss": 0.14238927364349366,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6559315866084425,
+      "grad_norm": 0.19264160096645355,
+      "learning_rate": 1.402896132106415e-05,
+      "loss": 0.14297477006912232,
+      "step": 3605
+    },
+    {
+      "epoch": 0.6568413391557496,
+      "grad_norm": 0.18319948017597198,
+      "learning_rate": 1.3962867370024347e-05,
+      "loss": 0.1448880434036255,
+      "step": 3610
+    },
+    {
+      "epoch": 0.6577510917030568,
+      "grad_norm": 0.16507290303707123,
+      "learning_rate": 1.389686910023758e-05,
+      "loss": 0.14724698066711425,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6586608442503639,
+      "grad_norm": 0.17871244251728058,
+      "learning_rate": 1.3830967083844942e-05,
+      "loss": 0.14479386806488037,
+      "step": 3620
+    },
+    {
+      "epoch": 0.659570596797671,
+      "grad_norm": 0.1846228390932083,
+      "learning_rate": 1.3765161892153112e-05,
+      "loss": 0.1453616738319397,
+      "step": 3625
+    },
+    {
+      "epoch": 0.6604803493449781,
+      "grad_norm": 0.17185978591442108,
+      "learning_rate": 1.3699454095629372e-05,
+      "loss": 0.14906206130981445,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6613901018922853,
+      "grad_norm": 0.14751191437244415,
+      "learning_rate": 1.3633844263896698e-05,
+      "loss": 0.13991892337799072,
+      "step": 3635
+    },
+    {
+      "epoch": 0.6622998544395924,
+      "grad_norm": 0.22059763967990875,
+      "learning_rate": 1.3568332965728817e-05,
+      "loss": 0.14680869579315187,
+      "step": 3640
+    },
+    {
+      "epoch": 0.6632096069868996,
+      "grad_norm": 0.15295909345149994,
+      "learning_rate": 1.3502920769045232e-05,
+      "loss": 0.1404443383216858,
+      "step": 3645
+    },
+    {
+      "epoch": 0.6641193595342066,
+      "grad_norm": 0.14600558578968048,
+      "learning_rate": 1.3437608240906364e-05,
+      "loss": 0.14663270711898804,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6650291120815138,
+      "grad_norm": 0.15548352897167206,
+      "learning_rate": 1.3372395947508587e-05,
+      "loss": 0.1431443452835083,
+      "step": 3655
+    },
+    {
+      "epoch": 0.665938864628821,
+      "grad_norm": 0.1813388466835022,
+      "learning_rate": 1.3307284454179342e-05,
+      "loss": 0.1458706736564636,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6668486171761281,
+      "grad_norm": 0.16326870024204254,
+      "learning_rate": 1.3242274325372247e-05,
+      "loss": 0.14700595140457154,
+      "step": 3665
+    },
+    {
+      "epoch": 0.6677583697234353,
+      "grad_norm": 0.18779197335243225,
+      "learning_rate": 1.3177366124662149e-05,
+      "loss": 0.1497237801551819,
+      "step": 3670
+    },
+    {
+      "epoch": 0.6686681222707423,
+      "grad_norm": 0.16291002929210663,
+      "learning_rate": 1.3112560414740315e-05,
+      "loss": 0.1387086868286133,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6695778748180495,
+      "grad_norm": 0.1532297134399414,
+      "learning_rate": 1.3047857757409487e-05,
+      "loss": 0.14497545957565308,
+      "step": 3680
+    },
+    {
+      "epoch": 0.6704876273653566,
+      "grad_norm": 0.14697515964508057,
+      "learning_rate": 1.2983258713579066e-05,
+      "loss": 0.1494283437728882,
+      "step": 3685
+    },
+    {
+      "epoch": 0.6713973799126638,
+      "grad_norm": 0.15213452279567719,
+      "learning_rate": 1.2918763843260218e-05,
+      "loss": 0.1468907594680786,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6723071324599709,
+      "grad_norm": 0.1745215803384781,
+      "learning_rate": 1.285437370556099e-05,
+      "loss": 0.14997754096984864,
+      "step": 3695
+    },
+    {
+      "epoch": 0.673216885007278,
+      "grad_norm": 0.19207637012004852,
+      "learning_rate": 1.2790088858681577e-05,
+      "loss": 0.14202862977981567,
+      "step": 3700
+    },
+    {
+      "epoch": 0.6741266375545851,
+      "grad_norm": 0.1521359086036682,
+      "learning_rate": 1.2725909859909313e-05,
+      "loss": 0.14547673463821412,
+      "step": 3705
+    },
+    {
+      "epoch": 0.6750363901018923,
+      "grad_norm": 0.16975535452365875,
+      "learning_rate": 1.2661837265613999e-05,
+      "loss": 0.14006874561309815,
+      "step": 3710
+    },
+    {
+      "epoch": 0.6759461426491994,
+      "grad_norm": 0.22234582901000977,
+      "learning_rate": 1.2597871631242992e-05,
+      "loss": 0.13691173791885375,
+      "step": 3715
+    },
+    {
+      "epoch": 0.6768558951965066,
+      "grad_norm": 0.16082969307899475,
+      "learning_rate": 1.2534013511316383e-05,
+      "loss": 0.14932308197021485,
+      "step": 3720
+    },
+    {
+      "epoch": 0.6777656477438136,
+      "grad_norm": 0.1751091182231903,
+      "learning_rate": 1.247026345942226e-05,
+      "loss": 0.14531974792480468,
+      "step": 3725
+    },
+    {
+      "epoch": 0.6786754002911208,
+      "grad_norm": 0.15838147699832916,
+      "learning_rate": 1.2406622028211844e-05,
+      "loss": 0.14759832620620728,
+      "step": 3730
+    },
+    {
+      "epoch": 0.6795851528384279,
+      "grad_norm": 0.1771744042634964,
+      "learning_rate": 1.2343089769394714e-05,
+      "loss": 0.1382831573486328,
+      "step": 3735
+    },
+    {
+      "epoch": 0.6804949053857351,
+      "grad_norm": 0.16301538050174713,
+      "learning_rate": 1.2279667233734037e-05,
+      "loss": 0.14444775581359864,
+      "step": 3740
+    },
+    {
+      "epoch": 0.6814046579330422,
+      "grad_norm": 0.1584121286869049,
+      "learning_rate": 1.2216354971041796e-05,
+      "loss": 0.14200170040130616,
+      "step": 3745
+    },
+    {
+      "epoch": 0.6823144104803494,
+      "grad_norm": 0.139187291264534,
+      "learning_rate": 1.2153153530174007e-05,
+      "loss": 0.14318310022354125,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6832241630276564,
+      "grad_norm": 0.13665248453617096,
+      "learning_rate": 1.2090063459025955e-05,
+      "loss": 0.1411946654319763,
+      "step": 3755
+    },
+    {
+      "epoch": 0.6841339155749636,
+      "grad_norm": 0.16273781657218933,
+      "learning_rate": 1.2027085304527475e-05,
+      "loss": 0.14873508214950562,
+      "step": 3760
+    },
+    {
+      "epoch": 0.6850436681222707,
+      "grad_norm": 0.16317526996135712,
+      "learning_rate": 1.1964219612638194e-05,
+      "loss": 0.14644203186035157,
+      "step": 3765
+    },
+    {
+      "epoch": 0.6859534206695779,
+      "grad_norm": 0.17253617942333221,
+      "learning_rate": 1.1901466928342777e-05,
+      "loss": 0.14027841091156007,
+      "step": 3770
+    },
+    {
+      "epoch": 0.6868631732168851,
+      "grad_norm": 0.19692830741405487,
+      "learning_rate": 1.183882779564624e-05,
+      "loss": 0.14411110877990724,
+      "step": 3775
+    },
+    {
+      "epoch": 0.6877729257641921,
+      "grad_norm": 0.15444578230381012,
+      "learning_rate": 1.1776302757569214e-05,
+      "loss": 0.14355008602142333,
+      "step": 3780
+    },
+    {
+      "epoch": 0.6886826783114993,
+      "grad_norm": 0.1622200757265091,
+      "learning_rate": 1.1713892356143239e-05,
+      "loss": 0.14794334173202514,
+      "step": 3785
+    },
+    {
+      "epoch": 0.6895924308588064,
+      "grad_norm": 0.1898501068353653,
+      "learning_rate": 1.1651597132406073e-05,
+      "loss": 0.1418622612953186,
+      "step": 3790
+    },
+    {
+      "epoch": 0.6905021834061136,
+      "grad_norm": 0.17803208529949188,
+      "learning_rate": 1.1589417626396973e-05,
+      "loss": 0.14576040506362914,
+      "step": 3795
+    },
+    {
+      "epoch": 0.6914119359534207,
+      "grad_norm": 0.17138013243675232,
+      "learning_rate": 1.1527354377152053e-05,
+      "loss": 0.14494270086288452,
+      "step": 3800
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.0912782443288745e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-3800/training_args.bin b/checkpoint-3800/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-3800/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-3900/README.md b/checkpoint-3900/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-3900/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-3900/adapter_config.json b/checkpoint-3900/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-3900/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-3900/adapter_model.safetensors b/checkpoint-3900/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b1c4604f7b97d8f15ce3b6520f68580a8e21a217
--- /dev/null
+++ b/checkpoint-3900/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:930c154cc15967bdadde6e3a4e438cf417ae2f9ff8f99e23c659a4c10500f638
+size 169741912
diff --git a/checkpoint-3900/chat_template.jinja b/checkpoint-3900/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-3900/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders every entry of properties, sorted by key, as key:{ fields } with entries separated by commas. When filter_keys is true, keys listed in standard_keys are skipped. The required argument is accepted but never read inside this macro. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- add_comma turns true once a field has been written for the current key, so each later field is prefixed with a comma -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- The upper-cased type is always written last and is followed by the brace that closes this key -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool from tool_data['function'] as declaration:NAME followed by a braced body holding the description and, when present, the parameters and response sections. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the brace that closes parameters is written only inside this branch, so it is missing when params has no type; confirm this is intended -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): likewise the brace that closes response is written only when the type is OBJECT; confirm this is intended -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serializes one value recursively: strings are wrapped in the <|"|> delimiter, booleans become true or false, mappings become a braced key:value list sorted by key, other sequences become a bracketed list, and anything else is printed as-is. Mapping keys are wrapped in the delimiter only when escape_keys is true. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}{#- Strings never reach this branch because the string test above runs first -#}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Splits text on '<channel|>'; from each piece that contains '<|channel>' only the portion before that marker is kept, other pieces are kept whole. The kept portions are concatenated and the result is trimmed. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emits one tool result between '<|tool_response>' and '<tool_response|>'. A mapping response is written as response:NAME with its key:value pairs sorted by key; any other response is written under the single key value. tool_name is concatenated with string literals, so it has to be a string. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- ns.prev_message_type records the kind of the last special item rendered; it is read again after the message loop to decide whether a generation prompt is emitted -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- The leading system/developer message has been rendered here, so the main loop starts after it -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages: role 'tool' entries are skipped here and are rendered by the forward scan of the preceding assistant message that carries tool_calls -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel; only for tool-calling messages that come after the last user message -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name. default(..., true) is required because .get() yields None, not undefined, for a missing name, and None would break the string concatenation in format_tool_response_block -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array; a text part whose text is missing or None contributes an empty string -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- Tool call with no response rendered: open a tool response instead of closing the turn -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}{#- Close the turn unless tool responses were rendered and no content followed them -#}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- Opens a new model turn for generation, except when the last rendered item was a tool call or tool response -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-3900/optimizer.pt b/checkpoint-3900/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d6a5abecb5d413a2d1b28bb175cd6ed9cf91fabe
--- /dev/null
+++ b/checkpoint-3900/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3832963bedc2ddbde6ce39d4c5512721b4e48d086d12cdec63cef88f57e37f5f
+size 72807355
diff --git a/checkpoint-3900/processor_config.json b/checkpoint-3900/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-3900/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-3900/rng_state.pth b/checkpoint-3900/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-3900/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-3900/scheduler.pt b/checkpoint-3900/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5f4abc4618901a501ee3a382986ffeee15db63a0
--- /dev/null
+++ b/checkpoint-3900/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1bc4a059b28299a8bc4683917a6a5cb6a7fd2550c2a08109f253cfa3584ebaae
+size 1465
diff --git a/checkpoint-3900/tokenizer.json b/checkpoint-3900/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-3900/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-3900/tokenizer_config.json b/checkpoint-3900/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-3900/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-3900/trainer_state.json b/checkpoint-3900/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..30a48f9418583fbf44dbde1efce9bcebf59a78ba
--- /dev/null
+++ b/checkpoint-3900/trainer_state.json
@@ -0,0 +1,5502 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.7096069868995634,
+  "eval_steps": 100,
+  "global_step": 3900,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6377365356622998,
+      "grad_norm": 0.17014239728450775,
+      "learning_rate": 1.5370045130657366e-05,
+      "loss": 0.14694437980651856,
+      "step": 3505
+    },
+    {
+      "epoch": 0.638646288209607,
+      "grad_norm": 0.14744038879871368,
+      "learning_rate": 1.5302158945041838e-05,
+      "loss": 0.14434736967086792,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6395560407569141,
+      "grad_norm": 0.2069770246744156,
+      "learning_rate": 1.523435683031818e-05,
+      "loss": 0.13982917070388795,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6404657933042213,
+      "grad_norm": 0.17811502516269684,
+      "learning_rate": 1.5166639374265063e-05,
+      "loss": 0.1408839702606201,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6413755458515283,
+      "grad_norm": 0.165786474943161,
+      "learning_rate": 1.509900716392728e-05,
+      "loss": 0.15312877893447877,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6422852983988355,
+      "grad_norm": 0.1633884161710739,
+      "learning_rate": 1.5031460785610596e-05,
+      "loss": 0.1488795518875122,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6431950509461426,
+      "grad_norm": 0.16498984396457672,
+      "learning_rate": 1.4964000824876723e-05,
+      "loss": 0.15031465291976928,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6441048034934498,
+      "grad_norm": 0.18043678998947144,
+      "learning_rate": 1.4896627866538191e-05,
+      "loss": 0.147829806804657,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6450145560407569,
+      "grad_norm": 0.16813597083091736,
+      "learning_rate": 1.4829342494653315e-05,
+      "loss": 0.1418998956680298,
+      "step": 3545
+    },
+    {
+      "epoch": 0.645924308588064,
+      "grad_norm": 0.1817242056131363,
+      "learning_rate": 1.4762145292521118e-05,
+      "loss": 0.14508869647979736,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6468340611353712,
+      "grad_norm": 0.14666494727134705,
+      "learning_rate": 1.469503684267628e-05,
+      "loss": 0.14159854650497436,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6477438136826783,
+      "grad_norm": 0.16485381126403809,
+      "learning_rate": 1.4628017726884086e-05,
+      "loss": 0.14419105052947997,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6486535662299855,
+      "grad_norm": 0.16100342571735382,
+      "learning_rate": 1.4561088526135375e-05,
+      "loss": 0.14501721858978273,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6495633187772926,
+      "grad_norm": 0.16996590793132782,
+      "learning_rate": 1.4494249820641493e-05,
+      "loss": 0.1377166509628296,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6504730713245997,
+      "grad_norm": 0.16168837249279022,
+      "learning_rate": 1.4427502189829339e-05,
+      "loss": 0.1414325475692749,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6513828238719068,
+      "grad_norm": 0.16318906843662262,
+      "learning_rate": 1.436084621233621e-05,
+      "loss": 0.14685193300247193,
+      "step": 3580
+    },
+    {
+      "epoch": 0.652292576419214,
+      "grad_norm": 0.1636219322681427,
+      "learning_rate": 1.4294282466004899e-05,
+      "loss": 0.1405899167060852,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6532023289665211,
+      "grad_norm": 0.1838461309671402,
+      "learning_rate": 1.422781152787865e-05,
+      "loss": 0.14386332035064697,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6541120815138283,
+      "grad_norm": 0.1796344667673111,
+      "learning_rate": 1.4161433974196115e-05,
+      "loss": 0.1513024687767029,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6550218340611353,
+      "grad_norm": 0.16424529254436493,
+      "learning_rate": 1.4095150380386427e-05,
+      "loss": 0.14238927364349366,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6559315866084425,
+      "grad_norm": 0.19264160096645355,
+      "learning_rate": 1.402896132106415e-05,
+      "loss": 0.14297477006912232,
+      "step": 3605
+    },
+    {
+      "epoch": 0.6568413391557496,
+      "grad_norm": 0.18319948017597198,
+      "learning_rate": 1.3962867370024347e-05,
+      "loss": 0.1448880434036255,
+      "step": 3610
+    },
+    {
+      "epoch": 0.6577510917030568,
+      "grad_norm": 0.16507290303707123,
+      "learning_rate": 1.389686910023758e-05,
+      "loss": 0.14724698066711425,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6586608442503639,
+      "grad_norm": 0.17871244251728058,
+      "learning_rate": 1.3830967083844942e-05,
+      "loss": 0.14479386806488037,
+      "step": 3620
+    },
+    {
+      "epoch": 0.659570596797671,
+      "grad_norm": 0.1846228390932083,
+      "learning_rate": 1.3765161892153112e-05,
+      "loss": 0.1453616738319397,
+      "step": 3625
+    },
+    {
+      "epoch": 0.6604803493449781,
+      "grad_norm": 0.17185978591442108,
+      "learning_rate": 1.3699454095629372e-05,
+      "loss": 0.14906206130981445,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6613901018922853,
+      "grad_norm": 0.14751191437244415,
+      "learning_rate": 1.3633844263896698e-05,
+      "loss": 0.13991892337799072,
+      "step": 3635
+    },
+    {
+      "epoch": 0.6622998544395924,
+      "grad_norm": 0.22059763967990875,
+      "learning_rate": 1.3568332965728817e-05,
+      "loss": 0.14680869579315187,
+      "step": 3640
+    },
+    {
+      "epoch": 0.6632096069868996,
+      "grad_norm": 0.15295909345149994,
+      "learning_rate": 1.3502920769045232e-05,
+      "loss": 0.1404443383216858,
+      "step": 3645
+    },
+    {
+      "epoch": 0.6641193595342066,
+      "grad_norm": 0.14600558578968048,
+      "learning_rate": 1.3437608240906364e-05,
+      "loss": 0.14663270711898804,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6650291120815138,
+      "grad_norm": 0.15548352897167206,
+      "learning_rate": 1.3372395947508587e-05,
+      "loss": 0.1431443452835083,
+      "step": 3655
+    },
+    {
+      "epoch": 0.665938864628821,
+      "grad_norm": 0.1813388466835022,
+      "learning_rate": 1.3307284454179342e-05,
+      "loss": 0.1458706736564636,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6668486171761281,
+      "grad_norm": 0.16326870024204254,
+      "learning_rate": 1.3242274325372247e-05,
+      "loss": 0.14700595140457154,
+      "step": 3665
+    },
+    {
+      "epoch": 0.6677583697234353,
+      "grad_norm": 0.18779197335243225,
+      "learning_rate": 1.3177366124662149e-05,
+      "loss": 0.1497237801551819,
+      "step": 3670
+    },
+    {
+      "epoch": 0.6686681222707423,
+      "grad_norm": 0.16291002929210663,
+      "learning_rate": 1.3112560414740315e-05,
+      "loss": 0.1387086868286133,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6695778748180495,
+      "grad_norm": 0.1532297134399414,
+      "learning_rate": 1.3047857757409487e-05,
+      "loss": 0.14497545957565308,
+      "step": 3680
+    },
+    {
+      "epoch": 0.6704876273653566,
+      "grad_norm": 0.14697515964508057,
+      "learning_rate": 1.2983258713579066e-05,
+      "loss": 0.1494283437728882,
+      "step": 3685
+    },
+    {
+      "epoch": 0.6713973799126638,
+      "grad_norm": 0.15213452279567719,
+      "learning_rate": 1.2918763843260218e-05,
+      "loss": 0.1468907594680786,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6723071324599709,
+      "grad_norm": 0.1745215803384781,
+      "learning_rate": 1.285437370556099e-05,
+      "loss": 0.14997754096984864,
+      "step": 3695
+    },
+    {
+      "epoch": 0.673216885007278,
+      "grad_norm": 0.19207637012004852,
+      "learning_rate": 1.2790088858681577e-05,
+      "loss": 0.14202862977981567,
+      "step": 3700
+    },
+    {
+      "epoch": 0.6741266375545851,
+      "grad_norm": 0.1521359086036682,
+      "learning_rate": 1.2725909859909313e-05,
+      "loss": 0.14547673463821412,
+      "step": 3705
+    },
+    {
+      "epoch": 0.6750363901018923,
+      "grad_norm": 0.16975535452365875,
+      "learning_rate": 1.2661837265613999e-05,
+      "loss": 0.14006874561309815,
+      "step": 3710
+    },
+    {
+      "epoch": 0.6759461426491994,
+      "grad_norm": 0.22234582901000977,
+      "learning_rate": 1.2597871631242992e-05,
+      "loss": 0.13691173791885375,
+      "step": 3715
+    },
+    {
+      "epoch": 0.6768558951965066,
+      "grad_norm": 0.16082969307899475,
+      "learning_rate": 1.2534013511316383e-05,
+      "loss": 0.14932308197021485,
+      "step": 3720
+    },
+    {
+      "epoch": 0.6777656477438136,
+      "grad_norm": 0.1751091182231903,
+      "learning_rate": 1.247026345942226e-05,
+      "loss": 0.14531974792480468,
+      "step": 3725
+    },
+    {
+      "epoch": 0.6786754002911208,
+      "grad_norm": 0.15838147699832916,
+      "learning_rate": 1.2406622028211844e-05,
+      "loss": 0.14759832620620728,
+      "step": 3730
+    },
+    {
+      "epoch": 0.6795851528384279,
+      "grad_norm": 0.1771744042634964,
+      "learning_rate": 1.2343089769394714e-05,
+      "loss": 0.1382831573486328,
+      "step": 3735
+    },
+    {
+      "epoch": 0.6804949053857351,
+      "grad_norm": 0.16301538050174713,
+      "learning_rate": 1.2279667233734037e-05,
+      "loss": 0.14444775581359864,
+      "step": 3740
+    },
+    {
+      "epoch": 0.6814046579330422,
+      "grad_norm": 0.1584121286869049,
+      "learning_rate": 1.2216354971041796e-05,
+      "loss": 0.14200170040130616,
+      "step": 3745
+    },
+    {
+      "epoch": 0.6823144104803494,
+      "grad_norm": 0.139187291264534,
+      "learning_rate": 1.2153153530174007e-05,
+      "loss": 0.14318310022354125,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6832241630276564,
+      "grad_norm": 0.13665248453617096,
+      "learning_rate": 1.2090063459025955e-05,
+      "loss": 0.1411946654319763,
+      "step": 3755
+    },
+    {
+      "epoch": 0.6841339155749636,
+      "grad_norm": 0.16273781657218933,
+      "learning_rate": 1.2027085304527475e-05,
+      "loss": 0.14873508214950562,
+      "step": 3760
+    },
+    {
+      "epoch": 0.6850436681222707,
+      "grad_norm": 0.16317526996135712,
+      "learning_rate": 1.1964219612638194e-05,
+      "loss": 0.14644203186035157,
+      "step": 3765
+    },
+    {
+      "epoch": 0.6859534206695779,
+      "grad_norm": 0.17253617942333221,
+      "learning_rate": 1.1901466928342777e-05,
+      "loss": 0.14027841091156007,
+      "step": 3770
+    },
+    {
+      "epoch": 0.6868631732168851,
+      "grad_norm": 0.19692830741405487,
+      "learning_rate": 1.183882779564624e-05,
+      "loss": 0.14411110877990724,
+      "step": 3775
+    },
+    {
+      "epoch": 0.6877729257641921,
+      "grad_norm": 0.15444578230381012,
+      "learning_rate": 1.1776302757569214e-05,
+      "loss": 0.14355008602142333,
+      "step": 3780
+    },
+    {
+      "epoch": 0.6886826783114993,
+      "grad_norm": 0.1622200757265091,
+      "learning_rate": 1.1713892356143239e-05,
+      "loss": 0.14794334173202514,
+      "step": 3785
+    },
+    {
+      "epoch": 0.6895924308588064,
+      "grad_norm": 0.1898501068353653,
+      "learning_rate": 1.1651597132406073e-05,
+      "loss": 0.1418622612953186,
+      "step": 3790
+    },
+    {
+      "epoch": 0.6905021834061136,
+      "grad_norm": 0.17803208529949188,
+      "learning_rate": 1.1589417626396973e-05,
+      "loss": 0.14576040506362914,
+      "step": 3795
+    },
+    {
+      "epoch": 0.6914119359534207,
+      "grad_norm": 0.17138013243675232,
+      "learning_rate": 1.1527354377152053e-05,
+      "loss": 0.14494270086288452,
+      "step": 3800
+    },
+    {
+      "epoch": 0.6923216885007278,
+      "grad_norm": 0.15170913934707642,
+      "learning_rate": 1.1465407922699603e-05,
+      "loss": 0.144084370136261,
+      "step": 3805
+    },
+    {
+      "epoch": 0.6932314410480349,
+      "grad_norm": 0.158562570810318,
+      "learning_rate": 1.1403578800055387e-05,
+      "loss": 0.13636608123779298,
+      "step": 3810
+    },
+    {
+      "epoch": 0.6941411935953421,
+      "grad_norm": 0.17687302827835083,
+      "learning_rate": 1.1341867545218044e-05,
+      "loss": 0.14214688539505005,
+      "step": 3815
+    },
+    {
+      "epoch": 0.6950509461426492,
+      "grad_norm": 0.15394899249076843,
+      "learning_rate": 1.1280274693164378e-05,
+      "loss": 0.14914129972457885,
+      "step": 3820
+    },
+    {
+      "epoch": 0.6959606986899564,
+      "grad_norm": 0.15709355473518372,
+      "learning_rate": 1.12188007778448e-05,
+      "loss": 0.14798580408096312,
+      "step": 3825
+    },
+    {
+      "epoch": 0.6968704512372634,
+      "grad_norm": 0.16631539165973663,
+      "learning_rate": 1.115744633217864e-05,
+      "loss": 0.14756966829299928,
+      "step": 3830
+    },
+    {
+      "epoch": 0.6977802037845706,
+      "grad_norm": 0.15893076360225677,
+      "learning_rate": 1.109621188804951e-05,
+      "loss": 0.14061959981918334,
+      "step": 3835
+    },
+    {
+      "epoch": 0.6986899563318777,
+      "grad_norm": 0.183414489030838,
+      "learning_rate": 1.103509797630077e-05,
+      "loss": 0.1448473334312439,
+      "step": 3840
+    },
+    {
+      "epoch": 0.6995997088791849,
+      "grad_norm": 0.14087305963039398,
+      "learning_rate": 1.0974105126730841e-05,
+      "loss": 0.14369285106658936,
+      "step": 3845
+    },
+    {
+      "epoch": 0.700509461426492,
+      "grad_norm": 0.16919967532157898,
+      "learning_rate": 1.0913233868088685e-05,
+      "loss": 0.1478085398674011,
+      "step": 3850
+    },
+    {
+      "epoch": 0.7014192139737991,
+      "grad_norm": 0.1439533829689026,
+      "learning_rate": 1.0852484728069178e-05,
+      "loss": 0.14376721382141114,
+      "step": 3855
+    },
+    {
+      "epoch": 0.7023289665211062,
+      "grad_norm": 0.17719274759292603,
+      "learning_rate": 1.0791858233308521e-05,
+      "loss": 0.14089040756225585,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7032387190684134,
+      "grad_norm": 0.19753769040107727,
+      "learning_rate": 1.0731354909379754e-05,
+      "loss": 0.15021742582321168,
+      "step": 3865
+    },
+    {
+      "epoch": 0.7041484716157205,
+      "grad_norm": 0.19186992943286896,
+      "learning_rate": 1.0670975280788086e-05,
+      "loss": 0.14113202095031738,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7050582241630277,
+      "grad_norm": 0.1709229201078415,
+      "learning_rate": 1.0610719870966443e-05,
+      "loss": 0.1500566840171814,
+      "step": 3875
+    },
+    {
+      "epoch": 0.7059679767103348,
+      "grad_norm": 0.17846204340457916,
+      "learning_rate": 1.0550589202270892e-05,
+      "loss": 0.15014195442199707,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7068777292576419,
+      "grad_norm": 0.1827082335948944,
+      "learning_rate": 1.0490583795976091e-05,
+      "loss": 0.1423472762107849,
+      "step": 3885
+    },
+    {
+      "epoch": 0.7077874818049491,
+      "grad_norm": 0.17418377101421356,
+      "learning_rate": 1.043070417227083e-05,
+      "loss": 0.14668900966644288,
+      "step": 3890
+    },
+    {
+      "epoch": 0.7086972343522562,
+      "grad_norm": 0.17385616898536682,
+      "learning_rate": 1.0370950850253449e-05,
+      "loss": 0.14627279043197633,
+      "step": 3895
+    },
+    {
+      "epoch": 0.7096069868995634,
+      "grad_norm": 0.16486723721027374,
+      "learning_rate": 1.0311324347927404e-05,
+      "loss": 0.14603652954101562,
+      "step": 3900
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.146429330077812e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-3900/training_args.bin b/checkpoint-3900/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-3900/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-400/README.md b/checkpoint-400/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-400/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-400/adapter_config.json b/checkpoint-400/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-400/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-400/adapter_model.safetensors b/checkpoint-400/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..83637b00f165a90605bcc0237b5733b736b01632
--- /dev/null
+++ b/checkpoint-400/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5376434516ca8d29b90a6d57eb8dabcc28d57a2b4ee2686d6b396663c726ac03
+size 169741912
diff --git a/checkpoint-400/chat_template.jinja b/checkpoint-400/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-400/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Render a schema `properties` mapping as comma-separated `key:{...}` entries in sorted key order. With filter_keys=true, keys named in standard_keys are skipped. `required` is accepted but not read inside this macro body. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- becomes true once a field has been written for this key, so later fields are prefixed with ',' -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}{#- separator between sibling keys -#}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}{#- enum is rendered only for STRING-typed values -#}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}{#- ARRAY: render the non-empty `items` mapping entry by entry -#}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}{#- entries whose value is none are skipped -#}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}{#- OBJECT: recurse into nested properties, then list required names -#}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- no `properties` mapping: the value's own keys outside standard_keys are rendered as properties -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- `type` is always written last, followed by the brace that closes this key -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Render one tool definition as `declaration:NAME{description:...}` with optional `,parameters:{...}` and `,response:{...}` sections, all read from tool_data['function']. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the brace closing `parameters:{` is written only inside this branch, so params without a truthy 'type' leave it unclosed - confirm callers always supply 'type' -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): likewise, the brace closing `response:{` is written only when the response type is OBJECT -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serialize one value: strings are wrapped in <|"|> delimiters, booleans become true/false, mappings become a brace-enclosed key:value list in sorted key order, other sequences become a bracketed list, and anything else is printed as-is. escape_keys selects whether mapping keys are also wrapped in <|"|>; it is passed down unchanged on recursion. -#}
+    {%- if argument is string -%}{#- string and mapping are tested before `is sequence` because both also satisfy that test -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}{#- fallback (e.g. numbers, none): rendered with Jinja's default string conversion -#}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}
+    {#- Split text on the '<channel|>' closer and keep, from each piece, only what precedes -#}
+    {#- its first '<|channel>' opener; the concatenation is returned with outer whitespace trimmed. -#}
+    {%- set kept = namespace(out='') -%}
+    {%- for segment in text.split('<channel|>') -%}
+        {#- split() yields the whole segment when no opener is present, -#}
+        {#- so element 0 is always the part that comes before any opener. -#}
+        {%- set kept.out = kept.out + segment.split('<|channel>')[0] -%}
+    {%- endfor -%}
+    {{- kept.out | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emit one block delimited by '<|tool_response>' and '<tool_response|>' whose body starts with `response:` followed by tool_name. tool_name is concatenated with string literals below, so it must be a string. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}{#- mapping: written as key:value pairs in sorted key order, keys left unescaped -#}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}{#- any other value is wrapped under a single `value` key -#}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type records the kind of the piece rendered most recently; a namespace is used so assignments made inside loops and blocks persist -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block: emitted when thinking is enabled, tools are supplied, or the first message is a system/developer message -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}{#- content-parts list: each part's trimmed text is written followed by one space -#}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the leading system/developer message is consumed here, so later processing starts at the second message -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: record the position of the final user message (-1 if none) for the reasoning guard -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for scanned in loop_messages -%}
+    {#- later matches overwrite earlier ones, so the last user position wins -#}
+    {%- if scanned['role'] == 'user' -%}{%- set ns_turn.last_user_idx = loop.index0 -%}{%- endif -%}
+{%- endfor -%}
+{#- ns_turn.last_user_idx is a zero-based index into loop_messages -#}
+
+{#- Loop through messages. role:tool messages produce no output on their own iteration; they are rendered by the forward scan of the preceding assistant message that carries tool_calls. -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel, only for tool-calling messages that come after the last user message -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}{#- pre-serialized arguments are inserted verbatim between the braces -#}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag: set once at least one tool response has been written for this message -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name. default(..., true) also replaces None, which .get() returns for a missing 'name', so the name passed on is always a string -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}{#- only text parts contribute; a missing or None text counts as empty -#}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}{#- content is rendered into a variable first so its emptiness can be tested below -#}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- turn closing: tool calls with no response yet end on an opening tool-response marker; otherwise the turn is closed unless tool responses were written and no content followed -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Generation prompt: when requested, open a model turn unless the piece rendered -#}
+{#- most recently (ns.prev_message_type) was a tool call or a tool response. -#}
+{%- if add_generation_prompt and ns.prev_message_type not in ['tool_response', 'tool_call'] -%}
+    {{- '<|turn>model\n' -}}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-400/optimizer.pt b/checkpoint-400/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..baa99a9bad68f126e96ca47c559cb0f82638851b
--- /dev/null
+++ b/checkpoint-400/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05ae1fdde3f57b9ae0ee54b2f1db326f37e068e2a2c912cfe0b484bbd379453a
+size 72807355
diff --git a/checkpoint-400/processor_config.json b/checkpoint-400/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-400/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-400/rng_state.pth b/checkpoint-400/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-400/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-400/scheduler.pt b/checkpoint-400/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b1ad556e5b8b695a8493682eb5176a8a51ef3995
--- /dev/null
+++ b/checkpoint-400/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b91c3f06e07c33026be1365e870f031bb614ff69eebc663c24c46d29531e21d6
+size 1465
diff --git a/checkpoint-400/tokenizer.json b/checkpoint-400/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-400/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-400/tokenizer_config.json b/checkpoint-400/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-400/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-400/trainer_state.json b/checkpoint-400/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a3fb93958cc243843e2afbc35bb31eff8a945ee7
--- /dev/null
+++ b/checkpoint-400/trainer_state.json
@@ -0,0 +1,602 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.07278020378457059,
+  "eval_steps": 100,
+  "global_step": 400,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.2959893223753523e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-400/training_args.bin b/checkpoint-400/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-400/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-4000/README.md b/checkpoint-4000/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-4000/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-4000/adapter_config.json b/checkpoint-4000/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-4000/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-4000/adapter_model.safetensors b/checkpoint-4000/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..122225dd523bb4ebeaf0082ea3bcb92a6e1dac7a
--- /dev/null
+++ b/checkpoint-4000/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f6d4a4c0f7f29a79c24ba03c70be0b9f9ad2322eaec2507a4bdd253e1877f3e
+size 169741912
diff --git a/checkpoint-4000/chat_template.jinja b/checkpoint-4000/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-4000/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders every entry of `properties` as key:{...}, comma-separated and in dictsort order; when filter_keys is true, keys listed in standard_keys are skipped. NOTE(review): `required` is accepted but never read inside this macro. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}{#- found_first drives the comma written between emitted entries -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- add_comma becomes true once a field has been written inside this entry's braces -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}{#- enum is rendered only for STRING-typed entries -#}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}{#- for ARRAY entries a non-empty `items` mapping is rendered as items:{...} -#}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}{#- item keys whose value is none are omitted -#}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- no 'properties' mapping: recurse over the entry itself, skipping standard_keys via filter_keys=true -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- type is always written last, followed by the brace that closes this entry -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders tool_data['function'] as declaration:NAME{description:...} followed by the optional parameters and response sections. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the brace that closes parameters:{ is written only on the next line, i.e. only when params['type'] is truthy -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}{#- the response section is rendered only when the function has a 'response' key -#}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): the brace that closes response:{ is written only when the response type is OBJECT -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serialises one value: strings are wrapped in the escape token, booleans become true/false, mappings become {k:v,...} in dictsort order (keys wrapped only when escape_keys is set), sequences become [a,b,...], and anything else is printed as-is. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' -}}{{- argument -}}{{- '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {%- if argument -%}true{%- else -%}false{%- endif -%}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set sep = namespace(needed=false) -%}
+        {%- for entry_key, entry_value in argument | dictsort -%}
+            {%- if sep.needed %},{% endif -%}
+            {%- set sep.needed = true -%}
+            {%- if not escape_keys -%}
+                {{- entry_key -}}
+            {%- else -%}
+                {{- '<|"|>' + entry_key + '<|"|>' -}}
+            {%- endif -%}
+            :{{- format_argument(entry_value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for element in argument -%}
+            {%- if not loop.first %},{% endif -%}
+            {{- format_argument(element, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Splits text on the channel-end token and, in each piece, keeps only what precedes the first channel-start token; the concatenation is trimmed before output. -#}
+    {%- set acc = namespace(kept='') -%}
+    {%- for segment in text.split('<channel|>') -%}
+        {%- if '<|channel>' not in segment -%}
+            {%- set acc.kept = acc.kept + segment -%}
+        {%- else -%}
+            {%- set acc.kept = acc.kept + segment.split('<|channel>')[0] -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- acc.kept | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Wraps one tool result in the tool_response tokens: a mapping is rendered field by field in dictsort order, any other value is rendered as a single `value` field. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is not mapping -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for field, field_value in response | dictsort -%}
+            {%- if not loop.first %},{% endif -%}
+            {{- field -}}:{{- format_argument(field_value, escape_keys=False) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type records the kind of item emitted most recently; it is read when a turn is closed and when the generation prompt is added -#}
+{%- set loop_messages = messages -%}{#- replaced by messages[1:] further down when the first message is rendered into the system turn -#}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}{#- NOTE(review): messages[0] is indexed without an emptiness check; assumes at least one message is supplied -#}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}{#- string content is trimmed and emitted; for a sequence, each item's trimmed text is emitted followed by one space -#}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the leading system/developer message is consumed here, so later iteration starts after it -#}
+    {%- endif -%}
+    {%- if tools -%}{#- each tool declaration is wrapped in the tool start/end tokens inside the system turn -#}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: record the index of the final user message (stays -1 when there is none) for the reasoning guard -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for candidate in loop_messages -%}
+    {%- if candidate['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = loop.index0 -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}{#- role:tool messages are never rendered on their own; they are emitted by the forward scan of the assistant message that issued the calls -#}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}{#- reasoning is emitted only for tool-calling messages that come after the last user message -#}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name; default(..., true) also replaces the None that .get() returns for a missing key, which would otherwise fail in the string concatenation inside format_tool_response_block -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}{#- a text part without a 'text' key contributes an empty string instead of None -#}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- tool calls with no rendered response: emit the tool_response start token instead of closing the turn -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}{#- close the turn unless tool responses were rendered and no content followed them -#}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if ns.prev_message_type not in ['tool_response', 'tool_call'] -%}{#- no new model turn is opened when the last rendered item was a tool call or a tool response -#}
+    {%- if add_generation_prompt -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-4000/optimizer.pt b/checkpoint-4000/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..02df676d5405acbb1640036be3872d2e20f4f83a
--- /dev/null
+++ b/checkpoint-4000/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:261af0e2bee0b8fe5b21d7b03a5d9a95b5648f50dc926d3ad176128963a89839
+size 72807355
diff --git a/checkpoint-4000/processor_config.json b/checkpoint-4000/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-4000/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-4000/rng_state.pth b/checkpoint-4000/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-4000/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-4000/scheduler.pt b/checkpoint-4000/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7e45086845a384e4c8a98a875dc676b7e4dc576d
--- /dev/null
+++ b/checkpoint-4000/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f8e1aa2c2ec8acfeb7d7d0e346bde622c16805e36befbe0290006307b2751b20
+size 1465
diff --git a/checkpoint-4000/tokenizer.json b/checkpoint-4000/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-4000/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-4000/tokenizer_config.json b/checkpoint-4000/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-4000/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-4000/trainer_state.json b/checkpoint-4000/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..7e82d6b94c578cbceba70578b6bfe9649017fcfa
--- /dev/null
+++ b/checkpoint-4000/trainer_state.json
@@ -0,0 +1,5642 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.727802037845706,
+  "eval_steps": 100,
+  "global_step": 4000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6377365356622998,
+      "grad_norm": 0.17014239728450775,
+      "learning_rate": 1.5370045130657366e-05,
+      "loss": 0.14694437980651856,
+      "step": 3505
+    },
+    {
+      "epoch": 0.638646288209607,
+      "grad_norm": 0.14744038879871368,
+      "learning_rate": 1.5302158945041838e-05,
+      "loss": 0.14434736967086792,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6395560407569141,
+      "grad_norm": 0.2069770246744156,
+      "learning_rate": 1.523435683031818e-05,
+      "loss": 0.13982917070388795,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6404657933042213,
+      "grad_norm": 0.17811502516269684,
+      "learning_rate": 1.5166639374265063e-05,
+      "loss": 0.1408839702606201,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6413755458515283,
+      "grad_norm": 0.165786474943161,
+      "learning_rate": 1.509900716392728e-05,
+      "loss": 0.15312877893447877,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6422852983988355,
+      "grad_norm": 0.1633884161710739,
+      "learning_rate": 1.5031460785610596e-05,
+      "loss": 0.1488795518875122,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6431950509461426,
+      "grad_norm": 0.16498984396457672,
+      "learning_rate": 1.4964000824876723e-05,
+      "loss": 0.15031465291976928,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6441048034934498,
+      "grad_norm": 0.18043678998947144,
+      "learning_rate": 1.4896627866538191e-05,
+      "loss": 0.147829806804657,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6450145560407569,
+      "grad_norm": 0.16813597083091736,
+      "learning_rate": 1.4829342494653315e-05,
+      "loss": 0.1418998956680298,
+      "step": 3545
+    },
+    {
+      "epoch": 0.645924308588064,
+      "grad_norm": 0.1817242056131363,
+      "learning_rate": 1.4762145292521118e-05,
+      "loss": 0.14508869647979736,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6468340611353712,
+      "grad_norm": 0.14666494727134705,
+      "learning_rate": 1.469503684267628e-05,
+      "loss": 0.14159854650497436,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6477438136826783,
+      "grad_norm": 0.16485381126403809,
+      "learning_rate": 1.4628017726884086e-05,
+      "loss": 0.14419105052947997,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6486535662299855,
+      "grad_norm": 0.16100342571735382,
+      "learning_rate": 1.4561088526135375e-05,
+      "loss": 0.14501721858978273,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6495633187772926,
+      "grad_norm": 0.16996590793132782,
+      "learning_rate": 1.4494249820641493e-05,
+      "loss": 0.1377166509628296,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6504730713245997,
+      "grad_norm": 0.16168837249279022,
+      "learning_rate": 1.4427502189829339e-05,
+      "loss": 0.1414325475692749,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6513828238719068,
+      "grad_norm": 0.16318906843662262,
+      "learning_rate": 1.436084621233621e-05,
+      "loss": 0.14685193300247193,
+      "step": 3580
+    },
+    {
+      "epoch": 0.652292576419214,
+      "grad_norm": 0.1636219322681427,
+      "learning_rate": 1.4294282466004899e-05,
+      "loss": 0.1405899167060852,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6532023289665211,
+      "grad_norm": 0.1838461309671402,
+      "learning_rate": 1.422781152787865e-05,
+      "loss": 0.14386332035064697,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6541120815138283,
+      "grad_norm": 0.1796344667673111,
+      "learning_rate": 1.4161433974196115e-05,
+      "loss": 0.1513024687767029,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6550218340611353,
+      "grad_norm": 0.16424529254436493,
+      "learning_rate": 1.4095150380386427e-05,
+      "loss": 0.14238927364349366,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6559315866084425,
+      "grad_norm": 0.19264160096645355,
+      "learning_rate": 1.402896132106415e-05,
+      "loss": 0.14297477006912232,
+      "step": 3605
+    },
+    {
+      "epoch": 0.6568413391557496,
+      "grad_norm": 0.18319948017597198,
+      "learning_rate": 1.3962867370024347e-05,
+      "loss": 0.1448880434036255,
+      "step": 3610
+    },
+    {
+      "epoch": 0.6577510917030568,
+      "grad_norm": 0.16507290303707123,
+      "learning_rate": 1.389686910023758e-05,
+      "loss": 0.14724698066711425,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6586608442503639,
+      "grad_norm": 0.17871244251728058,
+      "learning_rate": 1.3830967083844942e-05,
+      "loss": 0.14479386806488037,
+      "step": 3620
+    },
+    {
+      "epoch": 0.659570596797671,
+      "grad_norm": 0.1846228390932083,
+      "learning_rate": 1.3765161892153112e-05,
+      "loss": 0.1453616738319397,
+      "step": 3625
+    },
+    {
+      "epoch": 0.6604803493449781,
+      "grad_norm": 0.17185978591442108,
+      "learning_rate": 1.3699454095629372e-05,
+      "loss": 0.14906206130981445,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6613901018922853,
+      "grad_norm": 0.14751191437244415,
+      "learning_rate": 1.3633844263896698e-05,
+      "loss": 0.13991892337799072,
+      "step": 3635
+    },
+    {
+      "epoch": 0.6622998544395924,
+      "grad_norm": 0.22059763967990875,
+      "learning_rate": 1.3568332965728817e-05,
+      "loss": 0.14680869579315187,
+      "step": 3640
+    },
+    {
+      "epoch": 0.6632096069868996,
+      "grad_norm": 0.15295909345149994,
+      "learning_rate": 1.3502920769045232e-05,
+      "loss": 0.1404443383216858,
+      "step": 3645
+    },
+    {
+      "epoch": 0.6641193595342066,
+      "grad_norm": 0.14600558578968048,
+      "learning_rate": 1.3437608240906364e-05,
+      "loss": 0.14663270711898804,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6650291120815138,
+      "grad_norm": 0.15548352897167206,
+      "learning_rate": 1.3372395947508587e-05,
+      "loss": 0.1431443452835083,
+      "step": 3655
+    },
+    {
+      "epoch": 0.665938864628821,
+      "grad_norm": 0.1813388466835022,
+      "learning_rate": 1.3307284454179342e-05,
+      "loss": 0.1458706736564636,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6668486171761281,
+      "grad_norm": 0.16326870024204254,
+      "learning_rate": 1.3242274325372247e-05,
+      "loss": 0.14700595140457154,
+      "step": 3665
+    },
+    {
+      "epoch": 0.6677583697234353,
+      "grad_norm": 0.18779197335243225,
+      "learning_rate": 1.3177366124662149e-05,
+      "loss": 0.1497237801551819,
+      "step": 3670
+    },
+    {
+      "epoch": 0.6686681222707423,
+      "grad_norm": 0.16291002929210663,
+      "learning_rate": 1.3112560414740315e-05,
+      "loss": 0.1387086868286133,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6695778748180495,
+      "grad_norm": 0.1532297134399414,
+      "learning_rate": 1.3047857757409487e-05,
+      "loss": 0.14497545957565308,
+      "step": 3680
+    },
+    {
+      "epoch": 0.6704876273653566,
+      "grad_norm": 0.14697515964508057,
+      "learning_rate": 1.2983258713579066e-05,
+      "loss": 0.1494283437728882,
+      "step": 3685
+    },
+    {
+      "epoch": 0.6713973799126638,
+      "grad_norm": 0.15213452279567719,
+      "learning_rate": 1.2918763843260218e-05,
+      "loss": 0.1468907594680786,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6723071324599709,
+      "grad_norm": 0.1745215803384781,
+      "learning_rate": 1.285437370556099e-05,
+      "loss": 0.14997754096984864,
+      "step": 3695
+    },
+    {
+      "epoch": 0.673216885007278,
+      "grad_norm": 0.19207637012004852,
+      "learning_rate": 1.2790088858681577e-05,
+      "loss": 0.14202862977981567,
+      "step": 3700
+    },
+    {
+      "epoch": 0.6741266375545851,
+      "grad_norm": 0.1521359086036682,
+      "learning_rate": 1.2725909859909313e-05,
+      "loss": 0.14547673463821412,
+      "step": 3705
+    },
+    {
+      "epoch": 0.6750363901018923,
+      "grad_norm": 0.16975535452365875,
+      "learning_rate": 1.2661837265613999e-05,
+      "loss": 0.14006874561309815,
+      "step": 3710
+    },
+    {
+      "epoch": 0.6759461426491994,
+      "grad_norm": 0.22234582901000977,
+      "learning_rate": 1.2597871631242992e-05,
+      "loss": 0.13691173791885375,
+      "step": 3715
+    },
+    {
+      "epoch": 0.6768558951965066,
+      "grad_norm": 0.16082969307899475,
+      "learning_rate": 1.2534013511316383e-05,
+      "loss": 0.14932308197021485,
+      "step": 3720
+    },
+    {
+      "epoch": 0.6777656477438136,
+      "grad_norm": 0.1751091182231903,
+      "learning_rate": 1.247026345942226e-05,
+      "loss": 0.14531974792480468,
+      "step": 3725
+    },
+    {
+      "epoch": 0.6786754002911208,
+      "grad_norm": 0.15838147699832916,
+      "learning_rate": 1.2406622028211844e-05,
+      "loss": 0.14759832620620728,
+      "step": 3730
+    },
+    {
+      "epoch": 0.6795851528384279,
+      "grad_norm": 0.1771744042634964,
+      "learning_rate": 1.2343089769394714e-05,
+      "loss": 0.1382831573486328,
+      "step": 3735
+    },
+    {
+      "epoch": 0.6804949053857351,
+      "grad_norm": 0.16301538050174713,
+      "learning_rate": 1.2279667233734037e-05,
+      "loss": 0.14444775581359864,
+      "step": 3740
+    },
+    {
+      "epoch": 0.6814046579330422,
+      "grad_norm": 0.1584121286869049,
+      "learning_rate": 1.2216354971041796e-05,
+      "loss": 0.14200170040130616,
+      "step": 3745
+    },
+    {
+      "epoch": 0.6823144104803494,
+      "grad_norm": 0.139187291264534,
+      "learning_rate": 1.2153153530174007e-05,
+      "loss": 0.14318310022354125,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6832241630276564,
+      "grad_norm": 0.13665248453617096,
+      "learning_rate": 1.2090063459025955e-05,
+      "loss": 0.1411946654319763,
+      "step": 3755
+    },
+    {
+      "epoch": 0.6841339155749636,
+      "grad_norm": 0.16273781657218933,
+      "learning_rate": 1.2027085304527475e-05,
+      "loss": 0.14873508214950562,
+      "step": 3760
+    },
+    {
+      "epoch": 0.6850436681222707,
+      "grad_norm": 0.16317526996135712,
+      "learning_rate": 1.1964219612638194e-05,
+      "loss": 0.14644203186035157,
+      "step": 3765
+    },
+    {
+      "epoch": 0.6859534206695779,
+      "grad_norm": 0.17253617942333221,
+      "learning_rate": 1.1901466928342777e-05,
+      "loss": 0.14027841091156007,
+      "step": 3770
+    },
+    {
+      "epoch": 0.6868631732168851,
+      "grad_norm": 0.19692830741405487,
+      "learning_rate": 1.183882779564624e-05,
+      "loss": 0.14411110877990724,
+      "step": 3775
+    },
+    {
+      "epoch": 0.6877729257641921,
+      "grad_norm": 0.15444578230381012,
+      "learning_rate": 1.1776302757569214e-05,
+      "loss": 0.14355008602142333,
+      "step": 3780
+    },
+    {
+      "epoch": 0.6886826783114993,
+      "grad_norm": 0.1622200757265091,
+      "learning_rate": 1.1713892356143239e-05,
+      "loss": 0.14794334173202514,
+      "step": 3785
+    },
+    {
+      "epoch": 0.6895924308588064,
+      "grad_norm": 0.1898501068353653,
+      "learning_rate": 1.1651597132406073e-05,
+      "loss": 0.1418622612953186,
+      "step": 3790
+    },
+    {
+      "epoch": 0.6905021834061136,
+      "grad_norm": 0.17803208529949188,
+      "learning_rate": 1.1589417626396973e-05,
+      "loss": 0.14576040506362914,
+      "step": 3795
+    },
+    {
+      "epoch": 0.6914119359534207,
+      "grad_norm": 0.17138013243675232,
+      "learning_rate": 1.1527354377152053e-05,
+      "loss": 0.14494270086288452,
+      "step": 3800
+    },
+    {
+      "epoch": 0.6923216885007278,
+      "grad_norm": 0.15170913934707642,
+      "learning_rate": 1.1465407922699603e-05,
+      "loss": 0.144084370136261,
+      "step": 3805
+    },
+    {
+      "epoch": 0.6932314410480349,
+      "grad_norm": 0.158562570810318,
+      "learning_rate": 1.1403578800055387e-05,
+      "loss": 0.13636608123779298,
+      "step": 3810
+    },
+    {
+      "epoch": 0.6941411935953421,
+      "grad_norm": 0.17687302827835083,
+      "learning_rate": 1.1341867545218044e-05,
+      "loss": 0.14214688539505005,
+      "step": 3815
+    },
+    {
+      "epoch": 0.6950509461426492,
+      "grad_norm": 0.15394899249076843,
+      "learning_rate": 1.1280274693164378e-05,
+      "loss": 0.14914129972457885,
+      "step": 3820
+    },
+    {
+      "epoch": 0.6959606986899564,
+      "grad_norm": 0.15709355473518372,
+      "learning_rate": 1.12188007778448e-05,
+      "loss": 0.14798580408096312,
+      "step": 3825
+    },
+    {
+      "epoch": 0.6968704512372634,
+      "grad_norm": 0.16631539165973663,
+      "learning_rate": 1.115744633217864e-05,
+      "loss": 0.14756966829299928,
+      "step": 3830
+    },
+    {
+      "epoch": 0.6977802037845706,
+      "grad_norm": 0.15893076360225677,
+      "learning_rate": 1.109621188804951e-05,
+      "loss": 0.14061959981918334,
+      "step": 3835
+    },
+    {
+      "epoch": 0.6986899563318777,
+      "grad_norm": 0.183414489030838,
+      "learning_rate": 1.103509797630077e-05,
+      "loss": 0.1448473334312439,
+      "step": 3840
+    },
+    {
+      "epoch": 0.6995997088791849,
+      "grad_norm": 0.14087305963039398,
+      "learning_rate": 1.0974105126730841e-05,
+      "loss": 0.14369285106658936,
+      "step": 3845
+    },
+    {
+      "epoch": 0.700509461426492,
+      "grad_norm": 0.16919967532157898,
+      "learning_rate": 1.0913233868088685e-05,
+      "loss": 0.1478085398674011,
+      "step": 3850
+    },
+    {
+      "epoch": 0.7014192139737991,
+      "grad_norm": 0.1439533829689026,
+      "learning_rate": 1.0852484728069178e-05,
+      "loss": 0.14376721382141114,
+      "step": 3855
+    },
+    {
+      "epoch": 0.7023289665211062,
+      "grad_norm": 0.17719274759292603,
+      "learning_rate": 1.0791858233308521e-05,
+      "loss": 0.14089040756225585,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7032387190684134,
+      "grad_norm": 0.19753769040107727,
+      "learning_rate": 1.0731354909379754e-05,
+      "loss": 0.15021742582321168,
+      "step": 3865
+    },
+    {
+      "epoch": 0.7041484716157205,
+      "grad_norm": 0.19186992943286896,
+      "learning_rate": 1.0670975280788086e-05,
+      "loss": 0.14113202095031738,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7050582241630277,
+      "grad_norm": 0.1709229201078415,
+      "learning_rate": 1.0610719870966443e-05,
+      "loss": 0.1500566840171814,
+      "step": 3875
+    },
+    {
+      "epoch": 0.7059679767103348,
+      "grad_norm": 0.17846204340457916,
+      "learning_rate": 1.0550589202270892e-05,
+      "loss": 0.15014195442199707,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7068777292576419,
+      "grad_norm": 0.1827082335948944,
+      "learning_rate": 1.0490583795976091e-05,
+      "loss": 0.1423472762107849,
+      "step": 3885
+    },
+    {
+      "epoch": 0.7077874818049491,
+      "grad_norm": 0.17418377101421356,
+      "learning_rate": 1.043070417227083e-05,
+      "loss": 0.14668900966644288,
+      "step": 3890
+    },
+    {
+      "epoch": 0.7086972343522562,
+      "grad_norm": 0.17385616898536682,
+      "learning_rate": 1.0370950850253449e-05,
+      "loss": 0.14627279043197633,
+      "step": 3895
+    },
+    {
+      "epoch": 0.7096069868995634,
+      "grad_norm": 0.16486723721027374,
+      "learning_rate": 1.0311324347927404e-05,
+      "loss": 0.14603652954101562,
+      "step": 3900
+    },
+    {
+      "epoch": 0.7105167394468704,
+      "grad_norm": 0.21806862950325012,
+      "learning_rate": 1.0251825182196732e-05,
+      "loss": 0.1488169550895691,
+      "step": 3905
+    },
+    {
+      "epoch": 0.7114264919941776,
+      "grad_norm": 0.19884569942951202,
+      "learning_rate": 1.019245386886159e-05,
+      "loss": 0.14387656450271608,
+      "step": 3910
+    },
+    {
+      "epoch": 0.7123362445414847,
+      "grad_norm": 0.16139011085033417,
+      "learning_rate": 1.0133210922613789e-05,
+      "loss": 0.1483074426651001,
+      "step": 3915
+    },
+    {
+      "epoch": 0.7132459970887919,
+      "grad_norm": 0.17000740766525269,
+      "learning_rate": 1.007409685703229e-05,
+      "loss": 0.14050065279006957,
+      "step": 3920
+    },
+    {
+      "epoch": 0.714155749636099,
+      "grad_norm": 0.17235304415225983,
+      "learning_rate": 1.0015112184578813e-05,
+      "loss": 0.1440442681312561,
+      "step": 3925
+    },
+    {
+      "epoch": 0.7150655021834061,
+      "grad_norm": 0.15737567842006683,
+      "learning_rate": 9.956257416593362e-06,
+      "loss": 0.14960765838623047,
+      "step": 3930
+    },
+    {
+      "epoch": 0.7159752547307132,
+      "grad_norm": 0.15499180555343628,
+      "learning_rate": 9.897533063289773e-06,
+      "loss": 0.14488829374313356,
+      "step": 3935
+    },
+    {
+      "epoch": 0.7168850072780204,
+      "grad_norm": 0.17744216322898865,
+      "learning_rate": 9.838939633751337e-06,
+      "loss": 0.1416949987411499,
+      "step": 3940
+    },
+    {
+      "epoch": 0.7177947598253275,
+      "grad_norm": 0.1597192883491516,
+      "learning_rate": 9.780477635926358e-06,
+      "loss": 0.14275280237197877,
+      "step": 3945
+    },
+    {
+      "epoch": 0.7187045123726347,
+      "grad_norm": 0.17800374329090118,
+      "learning_rate": 9.722147576623743e-06,
+      "loss": 0.14532098770141602,
+      "step": 3950
+    },
+    {
+      "epoch": 0.7196142649199417,
+      "grad_norm": 0.1828162521123886,
+      "learning_rate": 9.66394996150864e-06,
+      "loss": 0.14525585174560546,
+      "step": 3955
+    },
+    {
+      "epoch": 0.7205240174672489,
+      "grad_norm": 0.1800539344549179,
+      "learning_rate": 9.605885295098005e-06,
+      "loss": 0.14235819578170777,
+      "step": 3960
+    },
+    {
+      "epoch": 0.721433770014556,
+      "grad_norm": 0.16556483507156372,
+      "learning_rate": 9.54795408075628e-06,
+      "loss": 0.13965482711791993,
+      "step": 3965
+    },
+    {
+      "epoch": 0.7223435225618632,
+      "grad_norm": 0.1592024862766266,
+      "learning_rate": 9.49015682069101e-06,
+      "loss": 0.14051042795181273,
+      "step": 3970
+    },
+    {
+      "epoch": 0.7232532751091703,
+      "grad_norm": 0.18988847732543945,
+      "learning_rate": 9.43249401594846e-06,
+      "loss": 0.1436900496482849,
+      "step": 3975
+    },
+    {
+      "epoch": 0.7241630276564774,
+      "grad_norm": 0.24433808028697968,
+      "learning_rate": 9.374966166409329e-06,
+      "loss": 0.14883997440338134,
+      "step": 3980
+    },
+    {
+      "epoch": 0.7250727802037845,
+      "grad_norm": 0.15091639757156372,
+      "learning_rate": 9.317573770784352e-06,
+      "loss": 0.14726560115814208,
+      "step": 3985
+    },
+    {
+      "epoch": 0.7259825327510917,
+      "grad_norm": 0.17045573890209198,
+      "learning_rate": 9.260317326610051e-06,
+      "loss": 0.14120506048202514,
+      "step": 3990
+    },
+    {
+      "epoch": 0.7268922852983989,
+      "grad_norm": 0.18847957253456116,
+      "learning_rate": 9.203197330244343e-06,
+      "loss": 0.1377041220664978,
+      "step": 3995
+    },
+    {
+      "epoch": 0.727802037845706,
+      "grad_norm": 0.1516445279121399,
+      "learning_rate": 9.14621427686229e-06,
+      "loss": 0.14043946266174318,
+      "step": 4000
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.2009046316420787e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-4000/training_args.bin b/checkpoint-4000/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-4000/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-4100/README.md b/checkpoint-4100/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-4100/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-4100/adapter_config.json b/checkpoint-4100/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-4100/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-4100/adapter_model.safetensors b/checkpoint-4100/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4a18403524e41012c589c271ad4dc6f05d9c738d
--- /dev/null
+++ b/checkpoint-4100/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:58539d3520353ef7548f3cb4afce746f3ecf3ab64a94b9beee75d776193ff575
+size 169741912
diff --git a/checkpoint-4100/chat_template.jinja b/checkpoint-4100/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-4100/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders each entry of `properties` (sorted by key) as key:{description,enum|items,nullable,properties,required,type}. The `required` argument is accepted but never read in this body. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}{#- becomes true after the first emitted property; drives the separating comma between properties -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- becomes true once any field has been written inside the current key:{...} -#}
+        {%- if not filter_keys or key not in standard_keys -%}{#- with filter_keys, entries named like schema keywords (standard_keys) are skipped -#}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}{#- entries of the items schema whose value is none are omitted -#}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- OBJECT without a 'properties' mapping: recurse over the value itself, skipping standard_keys via filter_keys=true -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool as declaration:NAME{description:...,parameters:{...},response:{...}} from tool_data['function']. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>
+        {%- endif -%}{{- '}' -}}{#- close parameters:{ unconditionally so braces stay balanced when 'type' is absent -#}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>
+        {%- endif -%}{{- '}' -}}{#- close response:{ unconditionally; the type field itself is still only written for OBJECT -#}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serializes a value recursively: strings are wrapped in <|"|> delimiters, booleans become true/false, mappings become {k:v,...} sorted by key, sequences become [a,b,...]; anything else is printed as-is. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}{#- escape_keys wraps mapping keys in the same <|"|> delimiters used for string values -#}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}{#- strings and mappings were already caught by the earlier branches -#}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Drops thinking-channel spans from text: splits on '<channel|>' and, for every piece containing '<|channel>', keeps only what precedes that marker; the joined result is trimmed. -#}
+    {%- set ns = namespace(result='') -%}{#- namespace is needed so the accumulator survives the for-loop scope -#}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emits <|tool_response>response:NAME{...}<tool_response|>. A mapping response is rendered as key:value pairs sorted by key; any other response is rendered as a single value:... entry. tool_name is concatenated with '+', so it must be a string. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- ns.prev_message_type records the kind of the last emitted element; it is read when choosing the turn closer and the generation prompt -#}
+{%- set loop_messages = messages -%}{#- replaced by messages[1:] below when the first message is system/developer -#}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}{#- content-parts list: each part's 'text' is trimmed and followed by a single space -#}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}{#- stays -1 when loop_messages contains no user message -#}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages. role:tool messages are not rendered on their own; they are emitted by the forward scan of the assistant message that precedes them -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel; only for tool-calling messages after the last user message -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag becomes true once at least one tool response block was emitted for this message -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages; the scan stops at the first non-tool message -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}{#- boolean=true: .get() yields None (not undefined) for a missing key, which plain default() would pass through -#}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}{#- boolean=true so a text part without 'text' contributes '' instead of None -#}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- tool call with no response yet: open a response block instead of closing the turn -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}{#- the turn stays open only when tool responses were emitted and no content followed -#}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- Opens a model turn for generation, except when the last rendered message left ns.prev_message_type as 'tool_response' or 'tool_call' -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-4100/optimizer.pt b/checkpoint-4100/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3be23f7f70eba5099e26bbc4ac1ad6cb0317a3cb
--- /dev/null
+++ b/checkpoint-4100/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:daf64ecc2aea1438e939cefa2112beb45d33571013c1057b470fefbf0ceb1a15
+size 72807355
diff --git a/checkpoint-4100/processor_config.json b/checkpoint-4100/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-4100/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-4100/rng_state.pth b/checkpoint-4100/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-4100/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-4100/scheduler.pt b/checkpoint-4100/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e0c29f060d9dd4277eeb4ecf92eb795e8c08d003
--- /dev/null
+++ b/checkpoint-4100/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:70d73dcbe82614473137cbc421461dfb966bac186f60581edd01b1ecddd8cb5b
+size 1465
diff --git a/checkpoint-4100/tokenizer.json b/checkpoint-4100/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-4100/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-4100/tokenizer_config.json b/checkpoint-4100/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-4100/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-4100/trainer_state.json b/checkpoint-4100/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..90a7cc590216780c435cdf03afc86a636b100c7f
--- /dev/null
+++ b/checkpoint-4100/trainer_state.json
@@ -0,0 +1,5782 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.7459970887918487,
+  "eval_steps": 100,
+  "global_step": 4100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6377365356622998,
+      "grad_norm": 0.17014239728450775,
+      "learning_rate": 1.5370045130657366e-05,
+      "loss": 0.14694437980651856,
+      "step": 3505
+    },
+    {
+      "epoch": 0.638646288209607,
+      "grad_norm": 0.14744038879871368,
+      "learning_rate": 1.5302158945041838e-05,
+      "loss": 0.14434736967086792,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6395560407569141,
+      "grad_norm": 0.2069770246744156,
+      "learning_rate": 1.523435683031818e-05,
+      "loss": 0.13982917070388795,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6404657933042213,
+      "grad_norm": 0.17811502516269684,
+      "learning_rate": 1.5166639374265063e-05,
+      "loss": 0.1408839702606201,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6413755458515283,
+      "grad_norm": 0.165786474943161,
+      "learning_rate": 1.509900716392728e-05,
+      "loss": 0.15312877893447877,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6422852983988355,
+      "grad_norm": 0.1633884161710739,
+      "learning_rate": 1.5031460785610596e-05,
+      "loss": 0.1488795518875122,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6431950509461426,
+      "grad_norm": 0.16498984396457672,
+      "learning_rate": 1.4964000824876723e-05,
+      "loss": 0.15031465291976928,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6441048034934498,
+      "grad_norm": 0.18043678998947144,
+      "learning_rate": 1.4896627866538191e-05,
+      "loss": 0.147829806804657,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6450145560407569,
+      "grad_norm": 0.16813597083091736,
+      "learning_rate": 1.4829342494653315e-05,
+      "loss": 0.1418998956680298,
+      "step": 3545
+    },
+    {
+      "epoch": 0.645924308588064,
+      "grad_norm": 0.1817242056131363,
+      "learning_rate": 1.4762145292521118e-05,
+      "loss": 0.14508869647979736,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6468340611353712,
+      "grad_norm": 0.14666494727134705,
+      "learning_rate": 1.469503684267628e-05,
+      "loss": 0.14159854650497436,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6477438136826783,
+      "grad_norm": 0.16485381126403809,
+      "learning_rate": 1.4628017726884086e-05,
+      "loss": 0.14419105052947997,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6486535662299855,
+      "grad_norm": 0.16100342571735382,
+      "learning_rate": 1.4561088526135375e-05,
+      "loss": 0.14501721858978273,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6495633187772926,
+      "grad_norm": 0.16996590793132782,
+      "learning_rate": 1.4494249820641493e-05,
+      "loss": 0.1377166509628296,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6504730713245997,
+      "grad_norm": 0.16168837249279022,
+      "learning_rate": 1.4427502189829339e-05,
+      "loss": 0.1414325475692749,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6513828238719068,
+      "grad_norm": 0.16318906843662262,
+      "learning_rate": 1.436084621233621e-05,
+      "loss": 0.14685193300247193,
+      "step": 3580
+    },
+    {
+      "epoch": 0.652292576419214,
+      "grad_norm": 0.1636219322681427,
+      "learning_rate": 1.4294282466004899e-05,
+      "loss": 0.1405899167060852,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6532023289665211,
+      "grad_norm": 0.1838461309671402,
+      "learning_rate": 1.422781152787865e-05,
+      "loss": 0.14386332035064697,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6541120815138283,
+      "grad_norm": 0.1796344667673111,
+      "learning_rate": 1.4161433974196115e-05,
+      "loss": 0.1513024687767029,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6550218340611353,
+      "grad_norm": 0.16424529254436493,
+      "learning_rate": 1.4095150380386427e-05,
+      "loss": 0.14238927364349366,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6559315866084425,
+      "grad_norm": 0.19264160096645355,
+      "learning_rate": 1.402896132106415e-05,
+      "loss": 0.14297477006912232,
+      "step": 3605
+    },
+    {
+      "epoch": 0.6568413391557496,
+      "grad_norm": 0.18319948017597198,
+      "learning_rate": 1.3962867370024347e-05,
+      "loss": 0.1448880434036255,
+      "step": 3610
+    },
+    {
+      "epoch": 0.6577510917030568,
+      "grad_norm": 0.16507290303707123,
+      "learning_rate": 1.389686910023758e-05,
+      "loss": 0.14724698066711425,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6586608442503639,
+      "grad_norm": 0.17871244251728058,
+      "learning_rate": 1.3830967083844942e-05,
+      "loss": 0.14479386806488037,
+      "step": 3620
+    },
+    {
+      "epoch": 0.659570596797671,
+      "grad_norm": 0.1846228390932083,
+      "learning_rate": 1.3765161892153112e-05,
+      "loss": 0.1453616738319397,
+      "step": 3625
+    },
+    {
+      "epoch": 0.6604803493449781,
+      "grad_norm": 0.17185978591442108,
+      "learning_rate": 1.3699454095629372e-05,
+      "loss": 0.14906206130981445,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6613901018922853,
+      "grad_norm": 0.14751191437244415,
+      "learning_rate": 1.3633844263896698e-05,
+      "loss": 0.13991892337799072,
+      "step": 3635
+    },
+    {
+      "epoch": 0.6622998544395924,
+      "grad_norm": 0.22059763967990875,
+      "learning_rate": 1.3568332965728817e-05,
+      "loss": 0.14680869579315187,
+      "step": 3640
+    },
+    {
+      "epoch": 0.6632096069868996,
+      "grad_norm": 0.15295909345149994,
+      "learning_rate": 1.3502920769045232e-05,
+      "loss": 0.1404443383216858,
+      "step": 3645
+    },
+    {
+      "epoch": 0.6641193595342066,
+      "grad_norm": 0.14600558578968048,
+      "learning_rate": 1.3437608240906364e-05,
+      "loss": 0.14663270711898804,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6650291120815138,
+      "grad_norm": 0.15548352897167206,
+      "learning_rate": 1.3372395947508587e-05,
+      "loss": 0.1431443452835083,
+      "step": 3655
+    },
+    {
+      "epoch": 0.665938864628821,
+      "grad_norm": 0.1813388466835022,
+      "learning_rate": 1.3307284454179342e-05,
+      "loss": 0.1458706736564636,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6668486171761281,
+      "grad_norm": 0.16326870024204254,
+      "learning_rate": 1.3242274325372247e-05,
+      "loss": 0.14700595140457154,
+      "step": 3665
+    },
+    {
+      "epoch": 0.6677583697234353,
+      "grad_norm": 0.18779197335243225,
+      "learning_rate": 1.3177366124662149e-05,
+      "loss": 0.1497237801551819,
+      "step": 3670
+    },
+    {
+      "epoch": 0.6686681222707423,
+      "grad_norm": 0.16291002929210663,
+      "learning_rate": 1.3112560414740315e-05,
+      "loss": 0.1387086868286133,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6695778748180495,
+      "grad_norm": 0.1532297134399414,
+      "learning_rate": 1.3047857757409487e-05,
+      "loss": 0.14497545957565308,
+      "step": 3680
+    },
+    {
+      "epoch": 0.6704876273653566,
+      "grad_norm": 0.14697515964508057,
+      "learning_rate": 1.2983258713579066e-05,
+      "loss": 0.1494283437728882,
+      "step": 3685
+    },
+    {
+      "epoch": 0.6713973799126638,
+      "grad_norm": 0.15213452279567719,
+      "learning_rate": 1.2918763843260218e-05,
+      "loss": 0.1468907594680786,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6723071324599709,
+      "grad_norm": 0.1745215803384781,
+      "learning_rate": 1.285437370556099e-05,
+      "loss": 0.14997754096984864,
+      "step": 3695
+    },
+    {
+      "epoch": 0.673216885007278,
+      "grad_norm": 0.19207637012004852,
+      "learning_rate": 1.2790088858681577e-05,
+      "loss": 0.14202862977981567,
+      "step": 3700
+    },
+    {
+      "epoch": 0.6741266375545851,
+      "grad_norm": 0.1521359086036682,
+      "learning_rate": 1.2725909859909313e-05,
+      "loss": 0.14547673463821412,
+      "step": 3705
+    },
+    {
+      "epoch": 0.6750363901018923,
+      "grad_norm": 0.16975535452365875,
+      "learning_rate": 1.2661837265613999e-05,
+      "loss": 0.14006874561309815,
+      "step": 3710
+    },
+    {
+      "epoch": 0.6759461426491994,
+      "grad_norm": 0.22234582901000977,
+      "learning_rate": 1.2597871631242992e-05,
+      "loss": 0.13691173791885375,
+      "step": 3715
+    },
+    {
+      "epoch": 0.6768558951965066,
+      "grad_norm": 0.16082969307899475,
+      "learning_rate": 1.2534013511316383e-05,
+      "loss": 0.14932308197021485,
+      "step": 3720
+    },
+    {
+      "epoch": 0.6777656477438136,
+      "grad_norm": 0.1751091182231903,
+      "learning_rate": 1.247026345942226e-05,
+      "loss": 0.14531974792480468,
+      "step": 3725
+    },
+    {
+      "epoch": 0.6786754002911208,
+      "grad_norm": 0.15838147699832916,
+      "learning_rate": 1.2406622028211844e-05,
+      "loss": 0.14759832620620728,
+      "step": 3730
+    },
+    {
+      "epoch": 0.6795851528384279,
+      "grad_norm": 0.1771744042634964,
+      "learning_rate": 1.2343089769394714e-05,
+      "loss": 0.1382831573486328,
+      "step": 3735
+    },
+    {
+      "epoch": 0.6804949053857351,
+      "grad_norm": 0.16301538050174713,
+      "learning_rate": 1.2279667233734037e-05,
+      "loss": 0.14444775581359864,
+      "step": 3740
+    },
+    {
+      "epoch": 0.6814046579330422,
+      "grad_norm": 0.1584121286869049,
+      "learning_rate": 1.2216354971041796e-05,
+      "loss": 0.14200170040130616,
+      "step": 3745
+    },
+    {
+      "epoch": 0.6823144104803494,
+      "grad_norm": 0.139187291264534,
+      "learning_rate": 1.2153153530174007e-05,
+      "loss": 0.14318310022354125,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6832241630276564,
+      "grad_norm": 0.13665248453617096,
+      "learning_rate": 1.2090063459025955e-05,
+      "loss": 0.1411946654319763,
+      "step": 3755
+    },
+    {
+      "epoch": 0.6841339155749636,
+      "grad_norm": 0.16273781657218933,
+      "learning_rate": 1.2027085304527475e-05,
+      "loss": 0.14873508214950562,
+      "step": 3760
+    },
+    {
+      "epoch": 0.6850436681222707,
+      "grad_norm": 0.16317526996135712,
+      "learning_rate": 1.1964219612638194e-05,
+      "loss": 0.14644203186035157,
+      "step": 3765
+    },
+    {
+      "epoch": 0.6859534206695779,
+      "grad_norm": 0.17253617942333221,
+      "learning_rate": 1.1901466928342777e-05,
+      "loss": 0.14027841091156007,
+      "step": 3770
+    },
+    {
+      "epoch": 0.6868631732168851,
+      "grad_norm": 0.19692830741405487,
+      "learning_rate": 1.183882779564624e-05,
+      "loss": 0.14411110877990724,
+      "step": 3775
+    },
+    {
+      "epoch": 0.6877729257641921,
+      "grad_norm": 0.15444578230381012,
+      "learning_rate": 1.1776302757569214e-05,
+      "loss": 0.14355008602142333,
+      "step": 3780
+    },
+    {
+      "epoch": 0.6886826783114993,
+      "grad_norm": 0.1622200757265091,
+      "learning_rate": 1.1713892356143239e-05,
+      "loss": 0.14794334173202514,
+      "step": 3785
+    },
+    {
+      "epoch": 0.6895924308588064,
+      "grad_norm": 0.1898501068353653,
+      "learning_rate": 1.1651597132406073e-05,
+      "loss": 0.1418622612953186,
+      "step": 3790
+    },
+    {
+      "epoch": 0.6905021834061136,
+      "grad_norm": 0.17803208529949188,
+      "learning_rate": 1.1589417626396973e-05,
+      "loss": 0.14576040506362914,
+      "step": 3795
+    },
+    {
+      "epoch": 0.6914119359534207,
+      "grad_norm": 0.17138013243675232,
+      "learning_rate": 1.1527354377152053e-05,
+      "loss": 0.14494270086288452,
+      "step": 3800
+    },
+    {
+      "epoch": 0.6923216885007278,
+      "grad_norm": 0.15170913934707642,
+      "learning_rate": 1.1465407922699603e-05,
+      "loss": 0.144084370136261,
+      "step": 3805
+    },
+    {
+      "epoch": 0.6932314410480349,
+      "grad_norm": 0.158562570810318,
+      "learning_rate": 1.1403578800055387e-05,
+      "loss": 0.13636608123779298,
+      "step": 3810
+    },
+    {
+      "epoch": 0.6941411935953421,
+      "grad_norm": 0.17687302827835083,
+      "learning_rate": 1.1341867545218044e-05,
+      "loss": 0.14214688539505005,
+      "step": 3815
+    },
+    {
+      "epoch": 0.6950509461426492,
+      "grad_norm": 0.15394899249076843,
+      "learning_rate": 1.1280274693164378e-05,
+      "loss": 0.14914129972457885,
+      "step": 3820
+    },
+    {
+      "epoch": 0.6959606986899564,
+      "grad_norm": 0.15709355473518372,
+      "learning_rate": 1.12188007778448e-05,
+      "loss": 0.14798580408096312,
+      "step": 3825
+    },
+    {
+      "epoch": 0.6968704512372634,
+      "grad_norm": 0.16631539165973663,
+      "learning_rate": 1.115744633217864e-05,
+      "loss": 0.14756966829299928,
+      "step": 3830
+    },
+    {
+      "epoch": 0.6977802037845706,
+      "grad_norm": 0.15893076360225677,
+      "learning_rate": 1.109621188804951e-05,
+      "loss": 0.14061959981918334,
+      "step": 3835
+    },
+    {
+      "epoch": 0.6986899563318777,
+      "grad_norm": 0.183414489030838,
+      "learning_rate": 1.103509797630077e-05,
+      "loss": 0.1448473334312439,
+      "step": 3840
+    },
+    {
+      "epoch": 0.6995997088791849,
+      "grad_norm": 0.14087305963039398,
+      "learning_rate": 1.0974105126730841e-05,
+      "loss": 0.14369285106658936,
+      "step": 3845
+    },
+    {
+      "epoch": 0.700509461426492,
+      "grad_norm": 0.16919967532157898,
+      "learning_rate": 1.0913233868088685e-05,
+      "loss": 0.1478085398674011,
+      "step": 3850
+    },
+    {
+      "epoch": 0.7014192139737991,
+      "grad_norm": 0.1439533829689026,
+      "learning_rate": 1.0852484728069178e-05,
+      "loss": 0.14376721382141114,
+      "step": 3855
+    },
+    {
+      "epoch": 0.7023289665211062,
+      "grad_norm": 0.17719274759292603,
+      "learning_rate": 1.0791858233308521e-05,
+      "loss": 0.14089040756225585,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7032387190684134,
+      "grad_norm": 0.19753769040107727,
+      "learning_rate": 1.0731354909379754e-05,
+      "loss": 0.15021742582321168,
+      "step": 3865
+    },
+    {
+      "epoch": 0.7041484716157205,
+      "grad_norm": 0.19186992943286896,
+      "learning_rate": 1.0670975280788086e-05,
+      "loss": 0.14113202095031738,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7050582241630277,
+      "grad_norm": 0.1709229201078415,
+      "learning_rate": 1.0610719870966443e-05,
+      "loss": 0.1500566840171814,
+      "step": 3875
+    },
+    {
+      "epoch": 0.7059679767103348,
+      "grad_norm": 0.17846204340457916,
+      "learning_rate": 1.0550589202270892e-05,
+      "loss": 0.15014195442199707,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7068777292576419,
+      "grad_norm": 0.1827082335948944,
+      "learning_rate": 1.0490583795976091e-05,
+      "loss": 0.1423472762107849,
+      "step": 3885
+    },
+    {
+      "epoch": 0.7077874818049491,
+      "grad_norm": 0.17418377101421356,
+      "learning_rate": 1.043070417227083e-05,
+      "loss": 0.14668900966644288,
+      "step": 3890
+    },
+    {
+      "epoch": 0.7086972343522562,
+      "grad_norm": 0.17385616898536682,
+      "learning_rate": 1.0370950850253449e-05,
+      "loss": 0.14627279043197633,
+      "step": 3895
+    },
+    {
+      "epoch": 0.7096069868995634,
+      "grad_norm": 0.16486723721027374,
+      "learning_rate": 1.0311324347927404e-05,
+      "loss": 0.14603652954101562,
+      "step": 3900
+    },
+    {
+      "epoch": 0.7105167394468704,
+      "grad_norm": 0.21806862950325012,
+      "learning_rate": 1.0251825182196732e-05,
+      "loss": 0.1488169550895691,
+      "step": 3905
+    },
+    {
+      "epoch": 0.7114264919941776,
+      "grad_norm": 0.19884569942951202,
+      "learning_rate": 1.019245386886159e-05,
+      "loss": 0.14387656450271608,
+      "step": 3910
+    },
+    {
+      "epoch": 0.7123362445414847,
+      "grad_norm": 0.16139011085033417,
+      "learning_rate": 1.0133210922613789e-05,
+      "loss": 0.1483074426651001,
+      "step": 3915
+    },
+    {
+      "epoch": 0.7132459970887919,
+      "grad_norm": 0.17000740766525269,
+      "learning_rate": 1.007409685703229e-05,
+      "loss": 0.14050065279006957,
+      "step": 3920
+    },
+    {
+      "epoch": 0.714155749636099,
+      "grad_norm": 0.17235304415225983,
+      "learning_rate": 1.0015112184578813e-05,
+      "loss": 0.1440442681312561,
+      "step": 3925
+    },
+    {
+      "epoch": 0.7150655021834061,
+      "grad_norm": 0.15737567842006683,
+      "learning_rate": 9.956257416593362e-06,
+      "loss": 0.14960765838623047,
+      "step": 3930
+    },
+    {
+      "epoch": 0.7159752547307132,
+      "grad_norm": 0.15499180555343628,
+      "learning_rate": 9.897533063289773e-06,
+      "loss": 0.14488829374313356,
+      "step": 3935
+    },
+    {
+      "epoch": 0.7168850072780204,
+      "grad_norm": 0.17744216322898865,
+      "learning_rate": 9.838939633751337e-06,
+      "loss": 0.1416949987411499,
+      "step": 3940
+    },
+    {
+      "epoch": 0.7177947598253275,
+      "grad_norm": 0.1597192883491516,
+      "learning_rate": 9.780477635926358e-06,
+      "loss": 0.14275280237197877,
+      "step": 3945
+    },
+    {
+      "epoch": 0.7187045123726347,
+      "grad_norm": 0.17800374329090118,
+      "learning_rate": 9.722147576623743e-06,
+      "loss": 0.14532098770141602,
+      "step": 3950
+    },
+    {
+      "epoch": 0.7196142649199417,
+      "grad_norm": 0.1828162521123886,
+      "learning_rate": 9.66394996150864e-06,
+      "loss": 0.14525585174560546,
+      "step": 3955
+    },
+    {
+      "epoch": 0.7205240174672489,
+      "grad_norm": 0.1800539344549179,
+      "learning_rate": 9.605885295098005e-06,
+      "loss": 0.14235819578170777,
+      "step": 3960
+    },
+    {
+      "epoch": 0.721433770014556,
+      "grad_norm": 0.16556483507156372,
+      "learning_rate": 9.54795408075628e-06,
+      "loss": 0.13965482711791993,
+      "step": 3965
+    },
+    {
+      "epoch": 0.7223435225618632,
+      "grad_norm": 0.1592024862766266,
+      "learning_rate": 9.49015682069101e-06,
+      "loss": 0.14051042795181273,
+      "step": 3970
+    },
+    {
+      "epoch": 0.7232532751091703,
+      "grad_norm": 0.18988847732543945,
+      "learning_rate": 9.43249401594846e-06,
+      "loss": 0.1436900496482849,
+      "step": 3975
+    },
+    {
+      "epoch": 0.7241630276564774,
+      "grad_norm": 0.24433808028697968,
+      "learning_rate": 9.374966166409329e-06,
+      "loss": 0.14883997440338134,
+      "step": 3980
+    },
+    {
+      "epoch": 0.7250727802037845,
+      "grad_norm": 0.15091639757156372,
+      "learning_rate": 9.317573770784352e-06,
+      "loss": 0.14726560115814208,
+      "step": 3985
+    },
+    {
+      "epoch": 0.7259825327510917,
+      "grad_norm": 0.17045573890209198,
+      "learning_rate": 9.260317326610051e-06,
+      "loss": 0.14120506048202514,
+      "step": 3990
+    },
+    {
+      "epoch": 0.7268922852983989,
+      "grad_norm": 0.18847957253456116,
+      "learning_rate": 9.203197330244343e-06,
+      "loss": 0.1377041220664978,
+      "step": 3995
+    },
+    {
+      "epoch": 0.727802037845706,
+      "grad_norm": 0.1516445279121399,
+      "learning_rate": 9.14621427686229e-06,
+      "loss": 0.14043946266174318,
+      "step": 4000
+    },
+    {
+      "epoch": 0.7287117903930131,
+      "grad_norm": 0.18264050781726837,
+      "learning_rate": 9.0893686604518e-06,
+      "loss": 0.14080368280410765,
+      "step": 4005
+    },
+    {
+      "epoch": 0.7296215429403202,
+      "grad_norm": 0.19129371643066406,
+      "learning_rate": 9.032660973809312e-06,
+      "loss": 0.1402561902999878,
+      "step": 4010
+    },
+    {
+      "epoch": 0.7305312954876274,
+      "grad_norm": 0.15762710571289062,
+      "learning_rate": 8.976091708535567e-06,
+      "loss": 0.14421157836914061,
+      "step": 4015
+    },
+    {
+      "epoch": 0.7314410480349345,
+      "grad_norm": 0.17785198986530304,
+      "learning_rate": 8.919661355031331e-06,
+      "loss": 0.14999009370803834,
+      "step": 4020
+    },
+    {
+      "epoch": 0.7323508005822417,
+      "grad_norm": 0.15306031703948975,
+      "learning_rate": 8.8633704024931e-06,
+      "loss": 0.14101698398590087,
+      "step": 4025
+    },
+    {
+      "epoch": 0.7332605531295487,
+      "grad_norm": 0.16481758654117584,
+      "learning_rate": 8.807219338908968e-06,
+      "loss": 0.14170764684677123,
+      "step": 4030
+    },
+    {
+      "epoch": 0.7341703056768559,
+      "grad_norm": 0.14892235398292542,
+      "learning_rate": 8.751208651054257e-06,
+      "loss": 0.15317896604537964,
+      "step": 4035
+    },
+    {
+      "epoch": 0.735080058224163,
+      "grad_norm": 0.1775592565536499,
+      "learning_rate": 8.695338824487409e-06,
+      "loss": 0.1520617723464966,
+      "step": 4040
+    },
+    {
+      "epoch": 0.7359898107714702,
+      "grad_norm": 0.1614258885383606,
+      "learning_rate": 8.639610343545728e-06,
+      "loss": 0.13747400045394897,
+      "step": 4045
+    },
+    {
+      "epoch": 0.7368995633187773,
+      "grad_norm": 0.21415506303310394,
+      "learning_rate": 8.58402369134117e-06,
+      "loss": 0.1432439088821411,
+      "step": 4050
+    },
+    {
+      "epoch": 0.7378093158660844,
+      "grad_norm": 0.1759418249130249,
+      "learning_rate": 8.528579349756205e-06,
+      "loss": 0.141641104221344,
+      "step": 4055
+    },
+    {
+      "epoch": 0.7387190684133915,
+      "grad_norm": 0.16738329827785492,
+      "learning_rate": 8.47327779943957e-06,
+      "loss": 0.14294810295104982,
+      "step": 4060
+    },
+    {
+      "epoch": 0.7396288209606987,
+      "grad_norm": 0.13916844129562378,
+      "learning_rate": 8.41811951980217e-06,
+      "loss": 0.13876968622207642,
+      "step": 4065
+    },
+    {
+      "epoch": 0.7405385735080058,
+      "grad_norm": 0.1828441321849823,
+      "learning_rate": 8.36310498901288e-06,
+      "loss": 0.148428475856781,
+      "step": 4070
+    },
+    {
+      "epoch": 0.741448326055313,
+      "grad_norm": 0.16534076631069183,
+      "learning_rate": 8.308234683994415e-06,
+      "loss": 0.14222711324691772,
+      "step": 4075
+    },
+    {
+      "epoch": 0.74235807860262,
+      "grad_norm": 0.17922644317150116,
+      "learning_rate": 8.253509080419198e-06,
+      "loss": 0.14365782737731933,
+      "step": 4080
+    },
+    {
+      "epoch": 0.7432678311499272,
+      "grad_norm": 0.15061035752296448,
+      "learning_rate": 8.198928652705204e-06,
+      "loss": 0.13571925163269044,
+      "step": 4085
+    },
+    {
+      "epoch": 0.7441775836972343,
+      "grad_norm": 0.18075402081012726,
+      "learning_rate": 8.144493874011908e-06,
+      "loss": 0.14385528564453126,
+      "step": 4090
+    },
+    {
+      "epoch": 0.7450873362445415,
+      "grad_norm": 0.16514739394187927,
+      "learning_rate": 8.090205216236135e-06,
+      "loss": 0.14920626878738402,
+      "step": 4095
+    },
+    {
+      "epoch": 0.7459970887918487,
+      "grad_norm": 0.16453702747821808,
+      "learning_rate": 8.03606315000797e-06,
+      "loss": 0.14704222679138185,
+      "step": 4100
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.2550361457430938e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-4100/training_args.bin b/checkpoint-4100/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-4100/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-4200/README.md b/checkpoint-4200/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-4200/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-4200/adapter_config.json b/checkpoint-4200/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-4200/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-4200/adapter_model.safetensors b/checkpoint-4200/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2570f36ac78878c6e97e690641f61c09a36908ba
--- /dev/null
+++ b/checkpoint-4200/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8fdf9cab00c6fe8710b887249098c77545137bdf0cfbe89a2efce95eba18135
+size 169741912
diff --git a/checkpoint-4200/chat_template.jinja b/checkpoint-4200/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-4200/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-4200/optimizer.pt b/checkpoint-4200/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b547347b2b4646ecaa0f7228d245b95e5f955a9f
--- /dev/null
+++ b/checkpoint-4200/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:399886bf84ebe0ebecbc0b6e99ea015dc2c82c288b3ac7ae70f4140221e0b46c
+size 72807355
diff --git a/checkpoint-4200/processor_config.json b/checkpoint-4200/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-4200/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-4200/rng_state.pth b/checkpoint-4200/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-4200/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-4200/scheduler.pt b/checkpoint-4200/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c4d152871cc7296c135da6ad42c3ad6c0f28a4e7
--- /dev/null
+++ b/checkpoint-4200/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec927d5dd629391e1d3dbd2e680b7b7d117933dd3a6c4235cefa58c2e99af6d3
+size 1465
diff --git a/checkpoint-4200/tokenizer.json b/checkpoint-4200/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-4200/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-4200/tokenizer_config.json b/checkpoint-4200/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-4200/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-4200/trainer_state.json b/checkpoint-4200/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..973c0fc4e3750f721099ebadee25c19a26dce1c1
--- /dev/null
+++ b/checkpoint-4200/trainer_state.json
@@ -0,0 +1,5922 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.7641921397379913,
+  "eval_steps": 100,
+  "global_step": 4200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6377365356622998,
+      "grad_norm": 0.17014239728450775,
+      "learning_rate": 1.5370045130657366e-05,
+      "loss": 0.14694437980651856,
+      "step": 3505
+    },
+    {
+      "epoch": 0.638646288209607,
+      "grad_norm": 0.14744038879871368,
+      "learning_rate": 1.5302158945041838e-05,
+      "loss": 0.14434736967086792,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6395560407569141,
+      "grad_norm": 0.2069770246744156,
+      "learning_rate": 1.523435683031818e-05,
+      "loss": 0.13982917070388795,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6404657933042213,
+      "grad_norm": 0.17811502516269684,
+      "learning_rate": 1.5166639374265063e-05,
+      "loss": 0.1408839702606201,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6413755458515283,
+      "grad_norm": 0.165786474943161,
+      "learning_rate": 1.509900716392728e-05,
+      "loss": 0.15312877893447877,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6422852983988355,
+      "grad_norm": 0.1633884161710739,
+      "learning_rate": 1.5031460785610596e-05,
+      "loss": 0.1488795518875122,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6431950509461426,
+      "grad_norm": 0.16498984396457672,
+      "learning_rate": 1.4964000824876723e-05,
+      "loss": 0.15031465291976928,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6441048034934498,
+      "grad_norm": 0.18043678998947144,
+      "learning_rate": 1.4896627866538191e-05,
+      "loss": 0.147829806804657,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6450145560407569,
+      "grad_norm": 0.16813597083091736,
+      "learning_rate": 1.4829342494653315e-05,
+      "loss": 0.1418998956680298,
+      "step": 3545
+    },
+    {
+      "epoch": 0.645924308588064,
+      "grad_norm": 0.1817242056131363,
+      "learning_rate": 1.4762145292521118e-05,
+      "loss": 0.14508869647979736,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6468340611353712,
+      "grad_norm": 0.14666494727134705,
+      "learning_rate": 1.469503684267628e-05,
+      "loss": 0.14159854650497436,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6477438136826783,
+      "grad_norm": 0.16485381126403809,
+      "learning_rate": 1.4628017726884086e-05,
+      "loss": 0.14419105052947997,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6486535662299855,
+      "grad_norm": 0.16100342571735382,
+      "learning_rate": 1.4561088526135375e-05,
+      "loss": 0.14501721858978273,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6495633187772926,
+      "grad_norm": 0.16996590793132782,
+      "learning_rate": 1.4494249820641493e-05,
+      "loss": 0.1377166509628296,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6504730713245997,
+      "grad_norm": 0.16168837249279022,
+      "learning_rate": 1.4427502189829339e-05,
+      "loss": 0.1414325475692749,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6513828238719068,
+      "grad_norm": 0.16318906843662262,
+      "learning_rate": 1.436084621233621e-05,
+      "loss": 0.14685193300247193,
+      "step": 3580
+    },
+    {
+      "epoch": 0.652292576419214,
+      "grad_norm": 0.1636219322681427,
+      "learning_rate": 1.4294282466004899e-05,
+      "loss": 0.1405899167060852,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6532023289665211,
+      "grad_norm": 0.1838461309671402,
+      "learning_rate": 1.422781152787865e-05,
+      "loss": 0.14386332035064697,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6541120815138283,
+      "grad_norm": 0.1796344667673111,
+      "learning_rate": 1.4161433974196115e-05,
+      "loss": 0.1513024687767029,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6550218340611353,
+      "grad_norm": 0.16424529254436493,
+      "learning_rate": 1.4095150380386427e-05,
+      "loss": 0.14238927364349366,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6559315866084425,
+      "grad_norm": 0.19264160096645355,
+      "learning_rate": 1.402896132106415e-05,
+      "loss": 0.14297477006912232,
+      "step": 3605
+    },
+    {
+      "epoch": 0.6568413391557496,
+      "grad_norm": 0.18319948017597198,
+      "learning_rate": 1.3962867370024347e-05,
+      "loss": 0.1448880434036255,
+      "step": 3610
+    },
+    {
+      "epoch": 0.6577510917030568,
+      "grad_norm": 0.16507290303707123,
+      "learning_rate": 1.389686910023758e-05,
+      "loss": 0.14724698066711425,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6586608442503639,
+      "grad_norm": 0.17871244251728058,
+      "learning_rate": 1.3830967083844942e-05,
+      "loss": 0.14479386806488037,
+      "step": 3620
+    },
+    {
+      "epoch": 0.659570596797671,
+      "grad_norm": 0.1846228390932083,
+      "learning_rate": 1.3765161892153112e-05,
+      "loss": 0.1453616738319397,
+      "step": 3625
+    },
+    {
+      "epoch": 0.6604803493449781,
+      "grad_norm": 0.17185978591442108,
+      "learning_rate": 1.3699454095629372e-05,
+      "loss": 0.14906206130981445,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6613901018922853,
+      "grad_norm": 0.14751191437244415,
+      "learning_rate": 1.3633844263896698e-05,
+      "loss": 0.13991892337799072,
+      "step": 3635
+    },
+    {
+      "epoch": 0.6622998544395924,
+      "grad_norm": 0.22059763967990875,
+      "learning_rate": 1.3568332965728817e-05,
+      "loss": 0.14680869579315187,
+      "step": 3640
+    },
+    {
+      "epoch": 0.6632096069868996,
+      "grad_norm": 0.15295909345149994,
+      "learning_rate": 1.3502920769045232e-05,
+      "loss": 0.1404443383216858,
+      "step": 3645
+    },
+    {
+      "epoch": 0.6641193595342066,
+      "grad_norm": 0.14600558578968048,
+      "learning_rate": 1.3437608240906364e-05,
+      "loss": 0.14663270711898804,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6650291120815138,
+      "grad_norm": 0.15548352897167206,
+      "learning_rate": 1.3372395947508587e-05,
+      "loss": 0.1431443452835083,
+      "step": 3655
+    },
+    {
+      "epoch": 0.665938864628821,
+      "grad_norm": 0.1813388466835022,
+      "learning_rate": 1.3307284454179342e-05,
+      "loss": 0.1458706736564636,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6668486171761281,
+      "grad_norm": 0.16326870024204254,
+      "learning_rate": 1.3242274325372247e-05,
+      "loss": 0.14700595140457154,
+      "step": 3665
+    },
+    {
+      "epoch": 0.6677583697234353,
+      "grad_norm": 0.18779197335243225,
+      "learning_rate": 1.3177366124662149e-05,
+      "loss": 0.1497237801551819,
+      "step": 3670
+    },
+    {
+      "epoch": 0.6686681222707423,
+      "grad_norm": 0.16291002929210663,
+      "learning_rate": 1.3112560414740315e-05,
+      "loss": 0.1387086868286133,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6695778748180495,
+      "grad_norm": 0.1532297134399414,
+      "learning_rate": 1.3047857757409487e-05,
+      "loss": 0.14497545957565308,
+      "step": 3680
+    },
+    {
+      "epoch": 0.6704876273653566,
+      "grad_norm": 0.14697515964508057,
+      "learning_rate": 1.2983258713579066e-05,
+      "loss": 0.1494283437728882,
+      "step": 3685
+    },
+    {
+      "epoch": 0.6713973799126638,
+      "grad_norm": 0.15213452279567719,
+      "learning_rate": 1.2918763843260218e-05,
+      "loss": 0.1468907594680786,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6723071324599709,
+      "grad_norm": 0.1745215803384781,
+      "learning_rate": 1.285437370556099e-05,
+      "loss": 0.14997754096984864,
+      "step": 3695
+    },
+    {
+      "epoch": 0.673216885007278,
+      "grad_norm": 0.19207637012004852,
+      "learning_rate": 1.2790088858681577e-05,
+      "loss": 0.14202862977981567,
+      "step": 3700
+    },
+    {
+      "epoch": 0.6741266375545851,
+      "grad_norm": 0.1521359086036682,
+      "learning_rate": 1.2725909859909313e-05,
+      "loss": 0.14547673463821412,
+      "step": 3705
+    },
+    {
+      "epoch": 0.6750363901018923,
+      "grad_norm": 0.16975535452365875,
+      "learning_rate": 1.2661837265613999e-05,
+      "loss": 0.14006874561309815,
+      "step": 3710
+    },
+    {
+      "epoch": 0.6759461426491994,
+      "grad_norm": 0.22234582901000977,
+      "learning_rate": 1.2597871631242992e-05,
+      "loss": 0.13691173791885375,
+      "step": 3715
+    },
+    {
+      "epoch": 0.6768558951965066,
+      "grad_norm": 0.16082969307899475,
+      "learning_rate": 1.2534013511316383e-05,
+      "loss": 0.14932308197021485,
+      "step": 3720
+    },
+    {
+      "epoch": 0.6777656477438136,
+      "grad_norm": 0.1751091182231903,
+      "learning_rate": 1.247026345942226e-05,
+      "loss": 0.14531974792480468,
+      "step": 3725
+    },
+    {
+      "epoch": 0.6786754002911208,
+      "grad_norm": 0.15838147699832916,
+      "learning_rate": 1.2406622028211844e-05,
+      "loss": 0.14759832620620728,
+      "step": 3730
+    },
+    {
+      "epoch": 0.6795851528384279,
+      "grad_norm": 0.1771744042634964,
+      "learning_rate": 1.2343089769394714e-05,
+      "loss": 0.1382831573486328,
+      "step": 3735
+    },
+    {
+      "epoch": 0.6804949053857351,
+      "grad_norm": 0.16301538050174713,
+      "learning_rate": 1.2279667233734037e-05,
+      "loss": 0.14444775581359864,
+      "step": 3740
+    },
+    {
+      "epoch": 0.6814046579330422,
+      "grad_norm": 0.1584121286869049,
+      "learning_rate": 1.2216354971041796e-05,
+      "loss": 0.14200170040130616,
+      "step": 3745
+    },
+    {
+      "epoch": 0.6823144104803494,
+      "grad_norm": 0.139187291264534,
+      "learning_rate": 1.2153153530174007e-05,
+      "loss": 0.14318310022354125,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6832241630276564,
+      "grad_norm": 0.13665248453617096,
+      "learning_rate": 1.2090063459025955e-05,
+      "loss": 0.1411946654319763,
+      "step": 3755
+    },
+    {
+      "epoch": 0.6841339155749636,
+      "grad_norm": 0.16273781657218933,
+      "learning_rate": 1.2027085304527475e-05,
+      "loss": 0.14873508214950562,
+      "step": 3760
+    },
+    {
+      "epoch": 0.6850436681222707,
+      "grad_norm": 0.16317526996135712,
+      "learning_rate": 1.1964219612638194e-05,
+      "loss": 0.14644203186035157,
+      "step": 3765
+    },
+    {
+      "epoch": 0.6859534206695779,
+      "grad_norm": 0.17253617942333221,
+      "learning_rate": 1.1901466928342777e-05,
+      "loss": 0.14027841091156007,
+      "step": 3770
+    },
+    {
+      "epoch": 0.6868631732168851,
+      "grad_norm": 0.19692830741405487,
+      "learning_rate": 1.183882779564624e-05,
+      "loss": 0.14411110877990724,
+      "step": 3775
+    },
+    {
+      "epoch": 0.6877729257641921,
+      "grad_norm": 0.15444578230381012,
+      "learning_rate": 1.1776302757569214e-05,
+      "loss": 0.14355008602142333,
+      "step": 3780
+    },
+    {
+      "epoch": 0.6886826783114993,
+      "grad_norm": 0.1622200757265091,
+      "learning_rate": 1.1713892356143239e-05,
+      "loss": 0.14794334173202514,
+      "step": 3785
+    },
+    {
+      "epoch": 0.6895924308588064,
+      "grad_norm": 0.1898501068353653,
+      "learning_rate": 1.1651597132406073e-05,
+      "loss": 0.1418622612953186,
+      "step": 3790
+    },
+    {
+      "epoch": 0.6905021834061136,
+      "grad_norm": 0.17803208529949188,
+      "learning_rate": 1.1589417626396973e-05,
+      "loss": 0.14576040506362914,
+      "step": 3795
+    },
+    {
+      "epoch": 0.6914119359534207,
+      "grad_norm": 0.17138013243675232,
+      "learning_rate": 1.1527354377152053e-05,
+      "loss": 0.14494270086288452,
+      "step": 3800
+    },
+    {
+      "epoch": 0.6923216885007278,
+      "grad_norm": 0.15170913934707642,
+      "learning_rate": 1.1465407922699603e-05,
+      "loss": 0.144084370136261,
+      "step": 3805
+    },
+    {
+      "epoch": 0.6932314410480349,
+      "grad_norm": 0.158562570810318,
+      "learning_rate": 1.1403578800055387e-05,
+      "loss": 0.13636608123779298,
+      "step": 3810
+    },
+    {
+      "epoch": 0.6941411935953421,
+      "grad_norm": 0.17687302827835083,
+      "learning_rate": 1.1341867545218044e-05,
+      "loss": 0.14214688539505005,
+      "step": 3815
+    },
+    {
+      "epoch": 0.6950509461426492,
+      "grad_norm": 0.15394899249076843,
+      "learning_rate": 1.1280274693164378e-05,
+      "loss": 0.14914129972457885,
+      "step": 3820
+    },
+    {
+      "epoch": 0.6959606986899564,
+      "grad_norm": 0.15709355473518372,
+      "learning_rate": 1.12188007778448e-05,
+      "loss": 0.14798580408096312,
+      "step": 3825
+    },
+    {
+      "epoch": 0.6968704512372634,
+      "grad_norm": 0.16631539165973663,
+      "learning_rate": 1.115744633217864e-05,
+      "loss": 0.14756966829299928,
+      "step": 3830
+    },
+    {
+      "epoch": 0.6977802037845706,
+      "grad_norm": 0.15893076360225677,
+      "learning_rate": 1.109621188804951e-05,
+      "loss": 0.14061959981918334,
+      "step": 3835
+    },
+    {
+      "epoch": 0.6986899563318777,
+      "grad_norm": 0.183414489030838,
+      "learning_rate": 1.103509797630077e-05,
+      "loss": 0.1448473334312439,
+      "step": 3840
+    },
+    {
+      "epoch": 0.6995997088791849,
+      "grad_norm": 0.14087305963039398,
+      "learning_rate": 1.0974105126730841e-05,
+      "loss": 0.14369285106658936,
+      "step": 3845
+    },
+    {
+      "epoch": 0.700509461426492,
+      "grad_norm": 0.16919967532157898,
+      "learning_rate": 1.0913233868088685e-05,
+      "loss": 0.1478085398674011,
+      "step": 3850
+    },
+    {
+      "epoch": 0.7014192139737991,
+      "grad_norm": 0.1439533829689026,
+      "learning_rate": 1.0852484728069178e-05,
+      "loss": 0.14376721382141114,
+      "step": 3855
+    },
+    {
+      "epoch": 0.7023289665211062,
+      "grad_norm": 0.17719274759292603,
+      "learning_rate": 1.0791858233308521e-05,
+      "loss": 0.14089040756225585,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7032387190684134,
+      "grad_norm": 0.19753769040107727,
+      "learning_rate": 1.0731354909379754e-05,
+      "loss": 0.15021742582321168,
+      "step": 3865
+    },
+    {
+      "epoch": 0.7041484716157205,
+      "grad_norm": 0.19186992943286896,
+      "learning_rate": 1.0670975280788086e-05,
+      "loss": 0.14113202095031738,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7050582241630277,
+      "grad_norm": 0.1709229201078415,
+      "learning_rate": 1.0610719870966443e-05,
+      "loss": 0.1500566840171814,
+      "step": 3875
+    },
+    {
+      "epoch": 0.7059679767103348,
+      "grad_norm": 0.17846204340457916,
+      "learning_rate": 1.0550589202270892e-05,
+      "loss": 0.15014195442199707,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7068777292576419,
+      "grad_norm": 0.1827082335948944,
+      "learning_rate": 1.0490583795976091e-05,
+      "loss": 0.1423472762107849,
+      "step": 3885
+    },
+    {
+      "epoch": 0.7077874818049491,
+      "grad_norm": 0.17418377101421356,
+      "learning_rate": 1.043070417227083e-05,
+      "loss": 0.14668900966644288,
+      "step": 3890
+    },
+    {
+      "epoch": 0.7086972343522562,
+      "grad_norm": 0.17385616898536682,
+      "learning_rate": 1.0370950850253449e-05,
+      "loss": 0.14627279043197633,
+      "step": 3895
+    },
+    {
+      "epoch": 0.7096069868995634,
+      "grad_norm": 0.16486723721027374,
+      "learning_rate": 1.0311324347927404e-05,
+      "loss": 0.14603652954101562,
+      "step": 3900
+    },
+    {
+      "epoch": 0.7105167394468704,
+      "grad_norm": 0.21806862950325012,
+      "learning_rate": 1.0251825182196732e-05,
+      "loss": 0.1488169550895691,
+      "step": 3905
+    },
+    {
+      "epoch": 0.7114264919941776,
+      "grad_norm": 0.19884569942951202,
+      "learning_rate": 1.019245386886159e-05,
+      "loss": 0.14387656450271608,
+      "step": 3910
+    },
+    {
+      "epoch": 0.7123362445414847,
+      "grad_norm": 0.16139011085033417,
+      "learning_rate": 1.0133210922613789e-05,
+      "loss": 0.1483074426651001,
+      "step": 3915
+    },
+    {
+      "epoch": 0.7132459970887919,
+      "grad_norm": 0.17000740766525269,
+      "learning_rate": 1.007409685703229e-05,
+      "loss": 0.14050065279006957,
+      "step": 3920
+    },
+    {
+      "epoch": 0.714155749636099,
+      "grad_norm": 0.17235304415225983,
+      "learning_rate": 1.0015112184578813e-05,
+      "loss": 0.1440442681312561,
+      "step": 3925
+    },
+    {
+      "epoch": 0.7150655021834061,
+      "grad_norm": 0.15737567842006683,
+      "learning_rate": 9.956257416593362e-06,
+      "loss": 0.14960765838623047,
+      "step": 3930
+    },
+    {
+      "epoch": 0.7159752547307132,
+      "grad_norm": 0.15499180555343628,
+      "learning_rate": 9.897533063289773e-06,
+      "loss": 0.14488829374313356,
+      "step": 3935
+    },
+    {
+      "epoch": 0.7168850072780204,
+      "grad_norm": 0.17744216322898865,
+      "learning_rate": 9.838939633751337e-06,
+      "loss": 0.1416949987411499,
+      "step": 3940
+    },
+    {
+      "epoch": 0.7177947598253275,
+      "grad_norm": 0.1597192883491516,
+      "learning_rate": 9.780477635926358e-06,
+      "loss": 0.14275280237197877,
+      "step": 3945
+    },
+    {
+      "epoch": 0.7187045123726347,
+      "grad_norm": 0.17800374329090118,
+      "learning_rate": 9.722147576623743e-06,
+      "loss": 0.14532098770141602,
+      "step": 3950
+    },
+    {
+      "epoch": 0.7196142649199417,
+      "grad_norm": 0.1828162521123886,
+      "learning_rate": 9.66394996150864e-06,
+      "loss": 0.14525585174560546,
+      "step": 3955
+    },
+    {
+      "epoch": 0.7205240174672489,
+      "grad_norm": 0.1800539344549179,
+      "learning_rate": 9.605885295098005e-06,
+      "loss": 0.14235819578170777,
+      "step": 3960
+    },
+    {
+      "epoch": 0.721433770014556,
+      "grad_norm": 0.16556483507156372,
+      "learning_rate": 9.54795408075628e-06,
+      "loss": 0.13965482711791993,
+      "step": 3965
+    },
+    {
+      "epoch": 0.7223435225618632,
+      "grad_norm": 0.1592024862766266,
+      "learning_rate": 9.49015682069101e-06,
+      "loss": 0.14051042795181273,
+      "step": 3970
+    },
+    {
+      "epoch": 0.7232532751091703,
+      "grad_norm": 0.18988847732543945,
+      "learning_rate": 9.43249401594846e-06,
+      "loss": 0.1436900496482849,
+      "step": 3975
+    },
+    {
+      "epoch": 0.7241630276564774,
+      "grad_norm": 0.24433808028697968,
+      "learning_rate": 9.374966166409329e-06,
+      "loss": 0.14883997440338134,
+      "step": 3980
+    },
+    {
+      "epoch": 0.7250727802037845,
+      "grad_norm": 0.15091639757156372,
+      "learning_rate": 9.317573770784352e-06,
+      "loss": 0.14726560115814208,
+      "step": 3985
+    },
+    {
+      "epoch": 0.7259825327510917,
+      "grad_norm": 0.17045573890209198,
+      "learning_rate": 9.260317326610051e-06,
+      "loss": 0.14120506048202514,
+      "step": 3990
+    },
+    {
+      "epoch": 0.7268922852983989,
+      "grad_norm": 0.18847957253456116,
+      "learning_rate": 9.203197330244343e-06,
+      "loss": 0.1377041220664978,
+      "step": 3995
+    },
+    {
+      "epoch": 0.727802037845706,
+      "grad_norm": 0.1516445279121399,
+      "learning_rate": 9.14621427686229e-06,
+      "loss": 0.14043946266174318,
+      "step": 4000
+    },
+    {
+      "epoch": 0.7287117903930131,
+      "grad_norm": 0.18264050781726837,
+      "learning_rate": 9.0893686604518e-06,
+      "loss": 0.14080368280410765,
+      "step": 4005
+    },
+    {
+      "epoch": 0.7296215429403202,
+      "grad_norm": 0.19129371643066406,
+      "learning_rate": 9.032660973809312e-06,
+      "loss": 0.1402561902999878,
+      "step": 4010
+    },
+    {
+      "epoch": 0.7305312954876274,
+      "grad_norm": 0.15762710571289062,
+      "learning_rate": 8.976091708535567e-06,
+      "loss": 0.14421157836914061,
+      "step": 4015
+    },
+    {
+      "epoch": 0.7314410480349345,
+      "grad_norm": 0.17785198986530304,
+      "learning_rate": 8.919661355031331e-06,
+      "loss": 0.14999009370803834,
+      "step": 4020
+    },
+    {
+      "epoch": 0.7323508005822417,
+      "grad_norm": 0.15306031703948975,
+      "learning_rate": 8.8633704024931e-06,
+      "loss": 0.14101698398590087,
+      "step": 4025
+    },
+    {
+      "epoch": 0.7332605531295487,
+      "grad_norm": 0.16481758654117584,
+      "learning_rate": 8.807219338908968e-06,
+      "loss": 0.14170764684677123,
+      "step": 4030
+    },
+    {
+      "epoch": 0.7341703056768559,
+      "grad_norm": 0.14892235398292542,
+      "learning_rate": 8.751208651054257e-06,
+      "loss": 0.15317896604537964,
+      "step": 4035
+    },
+    {
+      "epoch": 0.735080058224163,
+      "grad_norm": 0.1775592565536499,
+      "learning_rate": 8.695338824487409e-06,
+      "loss": 0.1520617723464966,
+      "step": 4040
+    },
+    {
+      "epoch": 0.7359898107714702,
+      "grad_norm": 0.1614258885383606,
+      "learning_rate": 8.639610343545728e-06,
+      "loss": 0.13747400045394897,
+      "step": 4045
+    },
+    {
+      "epoch": 0.7368995633187773,
+      "grad_norm": 0.21415506303310394,
+      "learning_rate": 8.58402369134117e-06,
+      "loss": 0.1432439088821411,
+      "step": 4050
+    },
+    {
+      "epoch": 0.7378093158660844,
+      "grad_norm": 0.1759418249130249,
+      "learning_rate": 8.528579349756205e-06,
+      "loss": 0.141641104221344,
+      "step": 4055
+    },
+    {
+      "epoch": 0.7387190684133915,
+      "grad_norm": 0.16738329827785492,
+      "learning_rate": 8.47327779943957e-06,
+      "loss": 0.14294810295104982,
+      "step": 4060
+    },
+    {
+      "epoch": 0.7396288209606987,
+      "grad_norm": 0.13916844129562378,
+      "learning_rate": 8.41811951980217e-06,
+      "loss": 0.13876968622207642,
+      "step": 4065
+    },
+    {
+      "epoch": 0.7405385735080058,
+      "grad_norm": 0.1828441321849823,
+      "learning_rate": 8.36310498901288e-06,
+      "loss": 0.148428475856781,
+      "step": 4070
+    },
+    {
+      "epoch": 0.741448326055313,
+      "grad_norm": 0.16534076631069183,
+      "learning_rate": 8.308234683994415e-06,
+      "loss": 0.14222711324691772,
+      "step": 4075
+    },
+    {
+      "epoch": 0.74235807860262,
+      "grad_norm": 0.17922644317150116,
+      "learning_rate": 8.253509080419198e-06,
+      "loss": 0.14365782737731933,
+      "step": 4080
+    },
+    {
+      "epoch": 0.7432678311499272,
+      "grad_norm": 0.15061035752296448,
+      "learning_rate": 8.198928652705204e-06,
+      "loss": 0.13571925163269044,
+      "step": 4085
+    },
+    {
+      "epoch": 0.7441775836972343,
+      "grad_norm": 0.18075402081012726,
+      "learning_rate": 8.144493874011908e-06,
+      "loss": 0.14385528564453126,
+      "step": 4090
+    },
+    {
+      "epoch": 0.7450873362445415,
+      "grad_norm": 0.16514739394187927,
+      "learning_rate": 8.090205216236135e-06,
+      "loss": 0.14920626878738402,
+      "step": 4095
+    },
+    {
+      "epoch": 0.7459970887918487,
+      "grad_norm": 0.16453702747821808,
+      "learning_rate": 8.03606315000797e-06,
+      "loss": 0.14704222679138185,
+      "step": 4100
+    },
+    {
+      "epoch": 0.7469068413391557,
+      "grad_norm": 0.16719917953014374,
+      "learning_rate": 7.982068144686707e-06,
+      "loss": 0.14722511768341065,
+      "step": 4105
+    },
+    {
+      "epoch": 0.7478165938864629,
+      "grad_norm": 0.18499110639095306,
+      "learning_rate": 7.92822066835677e-06,
+      "loss": 0.1401848554611206,
+      "step": 4110
+    },
+    {
+      "epoch": 0.74872634643377,
+      "grad_norm": 0.17249563336372375,
+      "learning_rate": 7.87452118782363e-06,
+      "loss": 0.15132423639297485,
+      "step": 4115
+    },
+    {
+      "epoch": 0.7496360989810772,
+      "grad_norm": 0.15049682557582855,
+      "learning_rate": 7.8209701686098e-06,
+      "loss": 0.1341150164604187,
+      "step": 4120
+    },
+    {
+      "epoch": 0.7505458515283843,
+      "grad_norm": 0.16892646253108978,
+      "learning_rate": 7.767568074950751e-06,
+      "loss": 0.1466840147972107,
+      "step": 4125
+    },
+    {
+      "epoch": 0.7514556040756915,
+      "grad_norm": 0.17288286983966827,
+      "learning_rate": 7.714315369790942e-06,
+      "loss": 0.13819680213928223,
+      "step": 4130
+    },
+    {
+      "epoch": 0.7523653566229985,
+      "grad_norm": 0.21893996000289917,
+      "learning_rate": 7.661212514779745e-06,
+      "loss": 0.14369510412216185,
+      "step": 4135
+    },
+    {
+      "epoch": 0.7532751091703057,
+      "grad_norm": 0.1674601435661316,
+      "learning_rate": 7.608259970267509e-06,
+      "loss": 0.14810250997543334,
+      "step": 4140
+    },
+    {
+      "epoch": 0.7541848617176128,
+      "grad_norm": 0.15875539183616638,
+      "learning_rate": 7.555458195301526e-06,
+      "loss": 0.14103198051452637,
+      "step": 4145
+    },
+    {
+      "epoch": 0.75509461426492,
+      "grad_norm": 0.19454079866409302,
+      "learning_rate": 7.502807647622037e-06,
+      "loss": 0.13848764896392823,
+      "step": 4150
+    },
+    {
+      "epoch": 0.756004366812227,
+      "grad_norm": 0.1795455813407898,
+      "learning_rate": 7.450308783658341e-06,
+      "loss": 0.14459335803985596,
+      "step": 4155
+    },
+    {
+      "epoch": 0.7569141193595342,
+      "grad_norm": 0.1643362045288086,
+      "learning_rate": 7.397962058524735e-06,
+      "loss": 0.14335378408432006,
+      "step": 4160
+    },
+    {
+      "epoch": 0.7578238719068413,
+      "grad_norm": 0.16362066566944122,
+      "learning_rate": 7.3457679260166475e-06,
+      "loss": 0.14222005605697632,
+      "step": 4165
+    },
+    {
+      "epoch": 0.7587336244541485,
+      "grad_norm": 0.17313003540039062,
+      "learning_rate": 7.293726838606674e-06,
+      "loss": 0.14272255897521974,
+      "step": 4170
+    },
+    {
+      "epoch": 0.7596433770014556,
+      "grad_norm": 0.1809929460287094,
+      "learning_rate": 7.2418392474406405e-06,
+      "loss": 0.14089123010635377,
+      "step": 4175
+    },
+    {
+      "epoch": 0.7605531295487628,
+      "grad_norm": 0.14306005835533142,
+      "learning_rate": 7.19010560233373e-06,
+      "loss": 0.13531534671783446,
+      "step": 4180
+    },
+    {
+      "epoch": 0.7614628820960698,
+      "grad_norm": 0.15525390207767487,
+      "learning_rate": 7.138526351766559e-06,
+      "loss": 0.14340845346450806,
+      "step": 4185
+    },
+    {
+      "epoch": 0.762372634643377,
+      "grad_norm": 0.24478943645954132,
+      "learning_rate": 7.087101942881263e-06,
+      "loss": 0.14744555950164795,
+      "step": 4190
+    },
+    {
+      "epoch": 0.7632823871906841,
+      "grad_norm": 0.31335577368736267,
+      "learning_rate": 7.035832821477711e-06,
+      "loss": 0.1484094500541687,
+      "step": 4195
+    },
+    {
+      "epoch": 0.7641921397379913,
+      "grad_norm": 0.15140366554260254,
+      "learning_rate": 6.984719432009515e-06,
+      "loss": 0.14991614818572999,
+      "step": 4200
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.310282758335583e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-4200/training_args.bin b/checkpoint-4200/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-4200/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-4300/README.md b/checkpoint-4300/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-4300/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for gemma-4-E4B-it LoRA Adapter (checkpoint-4300)
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** unsloth/gemma-4-E4B-it
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-4300/adapter_config.json b/checkpoint-4300/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-4300/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-4300/adapter_model.safetensors b/checkpoint-4300/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..21a3bfdf15ae89346be952eecbf49eaa9895b507
--- /dev/null
+++ b/checkpoint-4300/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eced2e5f1c6843a8db0df0ec5eb04b45b7692d56698e73044e8992c34df47313
+size 169741912
diff --git a/checkpoint-4300/chat_template.jinja b/checkpoint-4300/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-4300/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Render every entry of a properties mapping as key:{...}, sorted by key and comma-separated. Inside each entry the optional fields (description, enum or items, nullable, nested properties and required) come first and type is always written last. The required argument is accepted but never read here; with filter_keys=true, keys listed in standard_keys are skipped. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}{#- keys skipped when filter_keys is true -#}
+    {%- set ns = namespace(found_first=false) -%}{#- set once the first entry is written so later entries get a leading comma -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- becomes true once any field has been written inside the current entry -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- no properties mapping given: recurse over the value itself, skipping the standard schema keys -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- type is written unconditionally and closes the entry -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Render one tool as declaration:NAME followed by a braced body holding description, optional parameters and optional response. A comma is written before a field only when an earlier field exists, and each brace is closed where it was opened, so a schema without type (or a response whose type is not OBJECT) still yields balanced output. Output for schemas that carry a type is unchanged. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} }
+        {%- endif -%}
+        {%- if params['required'] -%}
+            {{- ',' if params['properties'] -}}required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>{{- ',' if not loop.last -}}
+            {%- endfor -%}]
+        {%- endif -%}
+        {%- if params['type'] -%}
+            {{- ',' if params['properties'] or params['required'] -}}type:<|"|>{{- params['type'] | upper -}}<|"|>
+        {%- endif -%}
+        }{#- closes the parameters brace whether or not a type was present -#}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            {{- ',' if response_declaration['description'] -}}type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>
+        {%- endif -%}
+        }{#- closes the response brace even when the type is not OBJECT -#}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {#- Serialise a value: strings are wrapped in the <|"|> quote token, booleans become true/false, mappings become a key-sorted braced list, sequences a bracketed list, anything else is printed as-is. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {%- if argument -%}true{%- else -%}false{%- endif -%}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- for entry_key, entry_value in argument | dictsort -%}
+            {#- the separator goes in front of every entry except the first -#}
+            {%- if not loop.first %},{% endif -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + entry_key + '<|"|>' -}}
+            {%- else -%}
+                {{- entry_key -}}
+            {%- endif -%}
+            :{{- format_argument(entry_value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for element in argument -%}
+            {%- if not loop.first %},{% endif -%}
+            {{- format_argument(element, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}
+    {#- Return text without its thinking-channel spans, trimmed of surrounding whitespace. -#}
+    {#- The text is cut at every <channel|> closer; from each piece only the part in front of -#}
+    {#- its first <|channel> opener is kept, and the kept parts are concatenated in order. -#}
+    {%- set acc = namespace(kept='') -%}
+    {%- for segment in text.split('<channel|>') -%}
+        {#- split(...)[0] is the whole segment when it contains no opener -#}
+        {%- set acc.kept = acc.kept + segment.split('<|channel>')[0] -%}
+    {%- endfor -%}
+    {{- acc.kept | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emit one tool result between the <|tool_response> and <tool_response|> tokens as response:NAME plus a braced body. A mapping is written field by field in key order; any other value is written under the single key value. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is not mapping -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for field, field_value in response | dictsort -%}
+            {%- if not loop.first %},{% endif -%}
+            {{- field -}}:{{- format_argument(field_value, escape_keys=False) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type records the kind of the last item rendered; it is read when closing a turn and when deciding on the generation prompt -#}
+{%- set loop_messages = messages -%}{#- replaced by messages[1:] further down when the first message is a system/developer message -#}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block: a leading system turn is emitted when thinking is enabled, tools are supplied, or the first message has role system or developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}{#- content given as a list of parts: each part's text is trimmed and followed by one space -#}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the first message has been rendered here, so the main loop starts after it -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: remember the position of the final user message in loop_messages (-1 when there is none); the reasoning guard in the message loop compares against it -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for candidate in loop_messages -%}
+    {%- if candidate['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = loop.index0 -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages. Messages with role tool are not rendered on their own; they are folded into the preceding message that carries the tool_calls (see the forward scan below) -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}{#- reasoning is replayed only after the last user message and only on messages that carry tool calls -#}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag is set once at least one tool response has been rendered for this message -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown', true), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name; default(..., true) is required because .get() yields None, not Undefined, for a missing key -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- tool calls with no rendered response: end on an opening <|tool_response> token instead of closing the turn -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}{#- close the turn unless tool responses were rendered and no content followed them -#}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Generation prompt: when requested, open a new model turn, -#}
+{#- except when the last rendered message ended with a tool call or a tool response -#}
+{%- if add_generation_prompt and ns.prev_message_type not in ['tool_response', 'tool_call'] -%}
+    {{- '<|turn>model\n' -}}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-4300/optimizer.pt b/checkpoint-4300/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4af328e6e08f11ea5ca478fb648c4889b691e981
--- /dev/null
+++ b/checkpoint-4300/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0008b10c3a319ad2cf110abf3c4ba5abf9cb83f10750b788ea1cdb1dabbe43e3
+size 72807355
diff --git a/checkpoint-4300/processor_config.json b/checkpoint-4300/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-4300/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-4300/rng_state.pth b/checkpoint-4300/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-4300/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-4300/scheduler.pt b/checkpoint-4300/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d209d60bdad40196a5d6cf52ef9fb8998ed35455
--- /dev/null
+++ b/checkpoint-4300/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5db332af5b43857b3570818d2a706d2392164182f86ae875d5e386c1f9ce788a
+size 1465
diff --git a/checkpoint-4300/tokenizer.json b/checkpoint-4300/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-4300/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-4300/tokenizer_config.json b/checkpoint-4300/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-4300/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-4300/trainer_state.json b/checkpoint-4300/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3c808868ab6b360268da6e3c09cf1ea9af405893
--- /dev/null
+++ b/checkpoint-4300/trainer_state.json
@@ -0,0 +1,6062 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.7823871906841339,
+  "eval_steps": 100,
+  "global_step": 4300,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6377365356622998,
+      "grad_norm": 0.17014239728450775,
+      "learning_rate": 1.5370045130657366e-05,
+      "loss": 0.14694437980651856,
+      "step": 3505
+    },
+    {
+      "epoch": 0.638646288209607,
+      "grad_norm": 0.14744038879871368,
+      "learning_rate": 1.5302158945041838e-05,
+      "loss": 0.14434736967086792,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6395560407569141,
+      "grad_norm": 0.2069770246744156,
+      "learning_rate": 1.523435683031818e-05,
+      "loss": 0.13982917070388795,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6404657933042213,
+      "grad_norm": 0.17811502516269684,
+      "learning_rate": 1.5166639374265063e-05,
+      "loss": 0.1408839702606201,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6413755458515283,
+      "grad_norm": 0.165786474943161,
+      "learning_rate": 1.509900716392728e-05,
+      "loss": 0.15312877893447877,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6422852983988355,
+      "grad_norm": 0.1633884161710739,
+      "learning_rate": 1.5031460785610596e-05,
+      "loss": 0.1488795518875122,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6431950509461426,
+      "grad_norm": 0.16498984396457672,
+      "learning_rate": 1.4964000824876723e-05,
+      "loss": 0.15031465291976928,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6441048034934498,
+      "grad_norm": 0.18043678998947144,
+      "learning_rate": 1.4896627866538191e-05,
+      "loss": 0.147829806804657,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6450145560407569,
+      "grad_norm": 0.16813597083091736,
+      "learning_rate": 1.4829342494653315e-05,
+      "loss": 0.1418998956680298,
+      "step": 3545
+    },
+    {
+      "epoch": 0.645924308588064,
+      "grad_norm": 0.1817242056131363,
+      "learning_rate": 1.4762145292521118e-05,
+      "loss": 0.14508869647979736,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6468340611353712,
+      "grad_norm": 0.14666494727134705,
+      "learning_rate": 1.469503684267628e-05,
+      "loss": 0.14159854650497436,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6477438136826783,
+      "grad_norm": 0.16485381126403809,
+      "learning_rate": 1.4628017726884086e-05,
+      "loss": 0.14419105052947997,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6486535662299855,
+      "grad_norm": 0.16100342571735382,
+      "learning_rate": 1.4561088526135375e-05,
+      "loss": 0.14501721858978273,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6495633187772926,
+      "grad_norm": 0.16996590793132782,
+      "learning_rate": 1.4494249820641493e-05,
+      "loss": 0.1377166509628296,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6504730713245997,
+      "grad_norm": 0.16168837249279022,
+      "learning_rate": 1.4427502189829339e-05,
+      "loss": 0.1414325475692749,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6513828238719068,
+      "grad_norm": 0.16318906843662262,
+      "learning_rate": 1.436084621233621e-05,
+      "loss": 0.14685193300247193,
+      "step": 3580
+    },
+    {
+      "epoch": 0.652292576419214,
+      "grad_norm": 0.1636219322681427,
+      "learning_rate": 1.4294282466004899e-05,
+      "loss": 0.1405899167060852,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6532023289665211,
+      "grad_norm": 0.1838461309671402,
+      "learning_rate": 1.422781152787865e-05,
+      "loss": 0.14386332035064697,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6541120815138283,
+      "grad_norm": 0.1796344667673111,
+      "learning_rate": 1.4161433974196115e-05,
+      "loss": 0.1513024687767029,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6550218340611353,
+      "grad_norm": 0.16424529254436493,
+      "learning_rate": 1.4095150380386427e-05,
+      "loss": 0.14238927364349366,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6559315866084425,
+      "grad_norm": 0.19264160096645355,
+      "learning_rate": 1.402896132106415e-05,
+      "loss": 0.14297477006912232,
+      "step": 3605
+    },
+    {
+      "epoch": 0.6568413391557496,
+      "grad_norm": 0.18319948017597198,
+      "learning_rate": 1.3962867370024347e-05,
+      "loss": 0.1448880434036255,
+      "step": 3610
+    },
+    {
+      "epoch": 0.6577510917030568,
+      "grad_norm": 0.16507290303707123,
+      "learning_rate": 1.389686910023758e-05,
+      "loss": 0.14724698066711425,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6586608442503639,
+      "grad_norm": 0.17871244251728058,
+      "learning_rate": 1.3830967083844942e-05,
+      "loss": 0.14479386806488037,
+      "step": 3620
+    },
+    {
+      "epoch": 0.659570596797671,
+      "grad_norm": 0.1846228390932083,
+      "learning_rate": 1.3765161892153112e-05,
+      "loss": 0.1453616738319397,
+      "step": 3625
+    },
+    {
+      "epoch": 0.6604803493449781,
+      "grad_norm": 0.17185978591442108,
+      "learning_rate": 1.3699454095629372e-05,
+      "loss": 0.14906206130981445,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6613901018922853,
+      "grad_norm": 0.14751191437244415,
+      "learning_rate": 1.3633844263896698e-05,
+      "loss": 0.13991892337799072,
+      "step": 3635
+    },
+    {
+      "epoch": 0.6622998544395924,
+      "grad_norm": 0.22059763967990875,
+      "learning_rate": 1.3568332965728817e-05,
+      "loss": 0.14680869579315187,
+      "step": 3640
+    },
+    {
+      "epoch": 0.6632096069868996,
+      "grad_norm": 0.15295909345149994,
+      "learning_rate": 1.3502920769045232e-05,
+      "loss": 0.1404443383216858,
+      "step": 3645
+    },
+    {
+      "epoch": 0.6641193595342066,
+      "grad_norm": 0.14600558578968048,
+      "learning_rate": 1.3437608240906364e-05,
+      "loss": 0.14663270711898804,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6650291120815138,
+      "grad_norm": 0.15548352897167206,
+      "learning_rate": 1.3372395947508587e-05,
+      "loss": 0.1431443452835083,
+      "step": 3655
+    },
+    {
+      "epoch": 0.665938864628821,
+      "grad_norm": 0.1813388466835022,
+      "learning_rate": 1.3307284454179342e-05,
+      "loss": 0.1458706736564636,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6668486171761281,
+      "grad_norm": 0.16326870024204254,
+      "learning_rate": 1.3242274325372247e-05,
+      "loss": 0.14700595140457154,
+      "step": 3665
+    },
+    {
+      "epoch": 0.6677583697234353,
+      "grad_norm": 0.18779197335243225,
+      "learning_rate": 1.3177366124662149e-05,
+      "loss": 0.1497237801551819,
+      "step": 3670
+    },
+    {
+      "epoch": 0.6686681222707423,
+      "grad_norm": 0.16291002929210663,
+      "learning_rate": 1.3112560414740315e-05,
+      "loss": 0.1387086868286133,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6695778748180495,
+      "grad_norm": 0.1532297134399414,
+      "learning_rate": 1.3047857757409487e-05,
+      "loss": 0.14497545957565308,
+      "step": 3680
+    },
+    {
+      "epoch": 0.6704876273653566,
+      "grad_norm": 0.14697515964508057,
+      "learning_rate": 1.2983258713579066e-05,
+      "loss": 0.1494283437728882,
+      "step": 3685
+    },
+    {
+      "epoch": 0.6713973799126638,
+      "grad_norm": 0.15213452279567719,
+      "learning_rate": 1.2918763843260218e-05,
+      "loss": 0.1468907594680786,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6723071324599709,
+      "grad_norm": 0.1745215803384781,
+      "learning_rate": 1.285437370556099e-05,
+      "loss": 0.14997754096984864,
+      "step": 3695
+    },
+    {
+      "epoch": 0.673216885007278,
+      "grad_norm": 0.19207637012004852,
+      "learning_rate": 1.2790088858681577e-05,
+      "loss": 0.14202862977981567,
+      "step": 3700
+    },
+    {
+      "epoch": 0.6741266375545851,
+      "grad_norm": 0.1521359086036682,
+      "learning_rate": 1.2725909859909313e-05,
+      "loss": 0.14547673463821412,
+      "step": 3705
+    },
+    {
+      "epoch": 0.6750363901018923,
+      "grad_norm": 0.16975535452365875,
+      "learning_rate": 1.2661837265613999e-05,
+      "loss": 0.14006874561309815,
+      "step": 3710
+    },
+    {
+      "epoch": 0.6759461426491994,
+      "grad_norm": 0.22234582901000977,
+      "learning_rate": 1.2597871631242992e-05,
+      "loss": 0.13691173791885375,
+      "step": 3715
+    },
+    {
+      "epoch": 0.6768558951965066,
+      "grad_norm": 0.16082969307899475,
+      "learning_rate": 1.2534013511316383e-05,
+      "loss": 0.14932308197021485,
+      "step": 3720
+    },
+    {
+      "epoch": 0.6777656477438136,
+      "grad_norm": 0.1751091182231903,
+      "learning_rate": 1.247026345942226e-05,
+      "loss": 0.14531974792480468,
+      "step": 3725
+    },
+    {
+      "epoch": 0.6786754002911208,
+      "grad_norm": 0.15838147699832916,
+      "learning_rate": 1.2406622028211844e-05,
+      "loss": 0.14759832620620728,
+      "step": 3730
+    },
+    {
+      "epoch": 0.6795851528384279,
+      "grad_norm": 0.1771744042634964,
+      "learning_rate": 1.2343089769394714e-05,
+      "loss": 0.1382831573486328,
+      "step": 3735
+    },
+    {
+      "epoch": 0.6804949053857351,
+      "grad_norm": 0.16301538050174713,
+      "learning_rate": 1.2279667233734037e-05,
+      "loss": 0.14444775581359864,
+      "step": 3740
+    },
+    {
+      "epoch": 0.6814046579330422,
+      "grad_norm": 0.1584121286869049,
+      "learning_rate": 1.2216354971041796e-05,
+      "loss": 0.14200170040130616,
+      "step": 3745
+    },
+    {
+      "epoch": 0.6823144104803494,
+      "grad_norm": 0.139187291264534,
+      "learning_rate": 1.2153153530174007e-05,
+      "loss": 0.14318310022354125,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6832241630276564,
+      "grad_norm": 0.13665248453617096,
+      "learning_rate": 1.2090063459025955e-05,
+      "loss": 0.1411946654319763,
+      "step": 3755
+    },
+    {
+      "epoch": 0.6841339155749636,
+      "grad_norm": 0.16273781657218933,
+      "learning_rate": 1.2027085304527475e-05,
+      "loss": 0.14873508214950562,
+      "step": 3760
+    },
+    {
+      "epoch": 0.6850436681222707,
+      "grad_norm": 0.16317526996135712,
+      "learning_rate": 1.1964219612638194e-05,
+      "loss": 0.14644203186035157,
+      "step": 3765
+    },
+    {
+      "epoch": 0.6859534206695779,
+      "grad_norm": 0.17253617942333221,
+      "learning_rate": 1.1901466928342777e-05,
+      "loss": 0.14027841091156007,
+      "step": 3770
+    },
+    {
+      "epoch": 0.6868631732168851,
+      "grad_norm": 0.19692830741405487,
+      "learning_rate": 1.183882779564624e-05,
+      "loss": 0.14411110877990724,
+      "step": 3775
+    },
+    {
+      "epoch": 0.6877729257641921,
+      "grad_norm": 0.15444578230381012,
+      "learning_rate": 1.1776302757569214e-05,
+      "loss": 0.14355008602142333,
+      "step": 3780
+    },
+    {
+      "epoch": 0.6886826783114993,
+      "grad_norm": 0.1622200757265091,
+      "learning_rate": 1.1713892356143239e-05,
+      "loss": 0.14794334173202514,
+      "step": 3785
+    },
+    {
+      "epoch": 0.6895924308588064,
+      "grad_norm": 0.1898501068353653,
+      "learning_rate": 1.1651597132406073e-05,
+      "loss": 0.1418622612953186,
+      "step": 3790
+    },
+    {
+      "epoch": 0.6905021834061136,
+      "grad_norm": 0.17803208529949188,
+      "learning_rate": 1.1589417626396973e-05,
+      "loss": 0.14576040506362914,
+      "step": 3795
+    },
+    {
+      "epoch": 0.6914119359534207,
+      "grad_norm": 0.17138013243675232,
+      "learning_rate": 1.1527354377152053e-05,
+      "loss": 0.14494270086288452,
+      "step": 3800
+    },
+    {
+      "epoch": 0.6923216885007278,
+      "grad_norm": 0.15170913934707642,
+      "learning_rate": 1.1465407922699603e-05,
+      "loss": 0.144084370136261,
+      "step": 3805
+    },
+    {
+      "epoch": 0.6932314410480349,
+      "grad_norm": 0.158562570810318,
+      "learning_rate": 1.1403578800055387e-05,
+      "loss": 0.13636608123779298,
+      "step": 3810
+    },
+    {
+      "epoch": 0.6941411935953421,
+      "grad_norm": 0.17687302827835083,
+      "learning_rate": 1.1341867545218044e-05,
+      "loss": 0.14214688539505005,
+      "step": 3815
+    },
+    {
+      "epoch": 0.6950509461426492,
+      "grad_norm": 0.15394899249076843,
+      "learning_rate": 1.1280274693164378e-05,
+      "loss": 0.14914129972457885,
+      "step": 3820
+    },
+    {
+      "epoch": 0.6959606986899564,
+      "grad_norm": 0.15709355473518372,
+      "learning_rate": 1.12188007778448e-05,
+      "loss": 0.14798580408096312,
+      "step": 3825
+    },
+    {
+      "epoch": 0.6968704512372634,
+      "grad_norm": 0.16631539165973663,
+      "learning_rate": 1.115744633217864e-05,
+      "loss": 0.14756966829299928,
+      "step": 3830
+    },
+    {
+      "epoch": 0.6977802037845706,
+      "grad_norm": 0.15893076360225677,
+      "learning_rate": 1.109621188804951e-05,
+      "loss": 0.14061959981918334,
+      "step": 3835
+    },
+    {
+      "epoch": 0.6986899563318777,
+      "grad_norm": 0.183414489030838,
+      "learning_rate": 1.103509797630077e-05,
+      "loss": 0.1448473334312439,
+      "step": 3840
+    },
+    {
+      "epoch": 0.6995997088791849,
+      "grad_norm": 0.14087305963039398,
+      "learning_rate": 1.0974105126730841e-05,
+      "loss": 0.14369285106658936,
+      "step": 3845
+    },
+    {
+      "epoch": 0.700509461426492,
+      "grad_norm": 0.16919967532157898,
+      "learning_rate": 1.0913233868088685e-05,
+      "loss": 0.1478085398674011,
+      "step": 3850
+    },
+    {
+      "epoch": 0.7014192139737991,
+      "grad_norm": 0.1439533829689026,
+      "learning_rate": 1.0852484728069178e-05,
+      "loss": 0.14376721382141114,
+      "step": 3855
+    },
+    {
+      "epoch": 0.7023289665211062,
+      "grad_norm": 0.17719274759292603,
+      "learning_rate": 1.0791858233308521e-05,
+      "loss": 0.14089040756225585,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7032387190684134,
+      "grad_norm": 0.19753769040107727,
+      "learning_rate": 1.0731354909379754e-05,
+      "loss": 0.15021742582321168,
+      "step": 3865
+    },
+    {
+      "epoch": 0.7041484716157205,
+      "grad_norm": 0.19186992943286896,
+      "learning_rate": 1.0670975280788086e-05,
+      "loss": 0.14113202095031738,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7050582241630277,
+      "grad_norm": 0.1709229201078415,
+      "learning_rate": 1.0610719870966443e-05,
+      "loss": 0.1500566840171814,
+      "step": 3875
+    },
+    {
+      "epoch": 0.7059679767103348,
+      "grad_norm": 0.17846204340457916,
+      "learning_rate": 1.0550589202270892e-05,
+      "loss": 0.15014195442199707,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7068777292576419,
+      "grad_norm": 0.1827082335948944,
+      "learning_rate": 1.0490583795976091e-05,
+      "loss": 0.1423472762107849,
+      "step": 3885
+    },
+    {
+      "epoch": 0.7077874818049491,
+      "grad_norm": 0.17418377101421356,
+      "learning_rate": 1.043070417227083e-05,
+      "loss": 0.14668900966644288,
+      "step": 3890
+    },
+    {
+      "epoch": 0.7086972343522562,
+      "grad_norm": 0.17385616898536682,
+      "learning_rate": 1.0370950850253449e-05,
+      "loss": 0.14627279043197633,
+      "step": 3895
+    },
+    {
+      "epoch": 0.7096069868995634,
+      "grad_norm": 0.16486723721027374,
+      "learning_rate": 1.0311324347927404e-05,
+      "loss": 0.14603652954101562,
+      "step": 3900
+    },
+    {
+      "epoch": 0.7105167394468704,
+      "grad_norm": 0.21806862950325012,
+      "learning_rate": 1.0251825182196732e-05,
+      "loss": 0.1488169550895691,
+      "step": 3905
+    },
+    {
+      "epoch": 0.7114264919941776,
+      "grad_norm": 0.19884569942951202,
+      "learning_rate": 1.019245386886159e-05,
+      "loss": 0.14387656450271608,
+      "step": 3910
+    },
+    {
+      "epoch": 0.7123362445414847,
+      "grad_norm": 0.16139011085033417,
+      "learning_rate": 1.0133210922613789e-05,
+      "loss": 0.1483074426651001,
+      "step": 3915
+    },
+    {
+      "epoch": 0.7132459970887919,
+      "grad_norm": 0.17000740766525269,
+      "learning_rate": 1.007409685703229e-05,
+      "loss": 0.14050065279006957,
+      "step": 3920
+    },
+    {
+      "epoch": 0.714155749636099,
+      "grad_norm": 0.17235304415225983,
+      "learning_rate": 1.0015112184578813e-05,
+      "loss": 0.1440442681312561,
+      "step": 3925
+    },
+    {
+      "epoch": 0.7150655021834061,
+      "grad_norm": 0.15737567842006683,
+      "learning_rate": 9.956257416593362e-06,
+      "loss": 0.14960765838623047,
+      "step": 3930
+    },
+    {
+      "epoch": 0.7159752547307132,
+      "grad_norm": 0.15499180555343628,
+      "learning_rate": 9.897533063289773e-06,
+      "loss": 0.14488829374313356,
+      "step": 3935
+    },
+    {
+      "epoch": 0.7168850072780204,
+      "grad_norm": 0.17744216322898865,
+      "learning_rate": 9.838939633751337e-06,
+      "loss": 0.1416949987411499,
+      "step": 3940
+    },
+    {
+      "epoch": 0.7177947598253275,
+      "grad_norm": 0.1597192883491516,
+      "learning_rate": 9.780477635926358e-06,
+      "loss": 0.14275280237197877,
+      "step": 3945
+    },
+    {
+      "epoch": 0.7187045123726347,
+      "grad_norm": 0.17800374329090118,
+      "learning_rate": 9.722147576623743e-06,
+      "loss": 0.14532098770141602,
+      "step": 3950
+    },
+    {
+      "epoch": 0.7196142649199417,
+      "grad_norm": 0.1828162521123886,
+      "learning_rate": 9.66394996150864e-06,
+      "loss": 0.14525585174560546,
+      "step": 3955
+    },
+    {
+      "epoch": 0.7205240174672489,
+      "grad_norm": 0.1800539344549179,
+      "learning_rate": 9.605885295098005e-06,
+      "loss": 0.14235819578170777,
+      "step": 3960
+    },
+    {
+      "epoch": 0.721433770014556,
+      "grad_norm": 0.16556483507156372,
+      "learning_rate": 9.54795408075628e-06,
+      "loss": 0.13965482711791993,
+      "step": 3965
+    },
+    {
+      "epoch": 0.7223435225618632,
+      "grad_norm": 0.1592024862766266,
+      "learning_rate": 9.49015682069101e-06,
+      "loss": 0.14051042795181273,
+      "step": 3970
+    },
+    {
+      "epoch": 0.7232532751091703,
+      "grad_norm": 0.18988847732543945,
+      "learning_rate": 9.43249401594846e-06,
+      "loss": 0.1436900496482849,
+      "step": 3975
+    },
+    {
+      "epoch": 0.7241630276564774,
+      "grad_norm": 0.24433808028697968,
+      "learning_rate": 9.374966166409329e-06,
+      "loss": 0.14883997440338134,
+      "step": 3980
+    },
+    {
+      "epoch": 0.7250727802037845,
+      "grad_norm": 0.15091639757156372,
+      "learning_rate": 9.317573770784352e-06,
+      "loss": 0.14726560115814208,
+      "step": 3985
+    },
+    {
+      "epoch": 0.7259825327510917,
+      "grad_norm": 0.17045573890209198,
+      "learning_rate": 9.260317326610051e-06,
+      "loss": 0.14120506048202514,
+      "step": 3990
+    },
+    {
+      "epoch": 0.7268922852983989,
+      "grad_norm": 0.18847957253456116,
+      "learning_rate": 9.203197330244343e-06,
+      "loss": 0.1377041220664978,
+      "step": 3995
+    },
+    {
+      "epoch": 0.727802037845706,
+      "grad_norm": 0.1516445279121399,
+      "learning_rate": 9.14621427686229e-06,
+      "loss": 0.14043946266174318,
+      "step": 4000
+    },
+    {
+      "epoch": 0.7287117903930131,
+      "grad_norm": 0.18264050781726837,
+      "learning_rate": 9.0893686604518e-06,
+      "loss": 0.14080368280410765,
+      "step": 4005
+    },
+    {
+      "epoch": 0.7296215429403202,
+      "grad_norm": 0.19129371643066406,
+      "learning_rate": 9.032660973809312e-06,
+      "loss": 0.1402561902999878,
+      "step": 4010
+    },
+    {
+      "epoch": 0.7305312954876274,
+      "grad_norm": 0.15762710571289062,
+      "learning_rate": 8.976091708535567e-06,
+      "loss": 0.14421157836914061,
+      "step": 4015
+    },
+    {
+      "epoch": 0.7314410480349345,
+      "grad_norm": 0.17785198986530304,
+      "learning_rate": 8.919661355031331e-06,
+      "loss": 0.14999009370803834,
+      "step": 4020
+    },
+    {
+      "epoch": 0.7323508005822417,
+      "grad_norm": 0.15306031703948975,
+      "learning_rate": 8.8633704024931e-06,
+      "loss": 0.14101698398590087,
+      "step": 4025
+    },
+    {
+      "epoch": 0.7332605531295487,
+      "grad_norm": 0.16481758654117584,
+      "learning_rate": 8.807219338908968e-06,
+      "loss": 0.14170764684677123,
+      "step": 4030
+    },
+    {
+      "epoch": 0.7341703056768559,
+      "grad_norm": 0.14892235398292542,
+      "learning_rate": 8.751208651054257e-06,
+      "loss": 0.15317896604537964,
+      "step": 4035
+    },
+    {
+      "epoch": 0.735080058224163,
+      "grad_norm": 0.1775592565536499,
+      "learning_rate": 8.695338824487409e-06,
+      "loss": 0.1520617723464966,
+      "step": 4040
+    },
+    {
+      "epoch": 0.7359898107714702,
+      "grad_norm": 0.1614258885383606,
+      "learning_rate": 8.639610343545728e-06,
+      "loss": 0.13747400045394897,
+      "step": 4045
+    },
+    {
+      "epoch": 0.7368995633187773,
+      "grad_norm": 0.21415506303310394,
+      "learning_rate": 8.58402369134117e-06,
+      "loss": 0.1432439088821411,
+      "step": 4050
+    },
+    {
+      "epoch": 0.7378093158660844,
+      "grad_norm": 0.1759418249130249,
+      "learning_rate": 8.528579349756205e-06,
+      "loss": 0.141641104221344,
+      "step": 4055
+    },
+    {
+      "epoch": 0.7387190684133915,
+      "grad_norm": 0.16738329827785492,
+      "learning_rate": 8.47327779943957e-06,
+      "loss": 0.14294810295104982,
+      "step": 4060
+    },
+    {
+      "epoch": 0.7396288209606987,
+      "grad_norm": 0.13916844129562378,
+      "learning_rate": 8.41811951980217e-06,
+      "loss": 0.13876968622207642,
+      "step": 4065
+    },
+    {
+      "epoch": 0.7405385735080058,
+      "grad_norm": 0.1828441321849823,
+      "learning_rate": 8.36310498901288e-06,
+      "loss": 0.148428475856781,
+      "step": 4070
+    },
+    {
+      "epoch": 0.741448326055313,
+      "grad_norm": 0.16534076631069183,
+      "learning_rate": 8.308234683994415e-06,
+      "loss": 0.14222711324691772,
+      "step": 4075
+    },
+    {
+      "epoch": 0.74235807860262,
+      "grad_norm": 0.17922644317150116,
+      "learning_rate": 8.253509080419198e-06,
+      "loss": 0.14365782737731933,
+      "step": 4080
+    },
+    {
+      "epoch": 0.7432678311499272,
+      "grad_norm": 0.15061035752296448,
+      "learning_rate": 8.198928652705204e-06,
+      "loss": 0.13571925163269044,
+      "step": 4085
+    },
+    {
+      "epoch": 0.7441775836972343,
+      "grad_norm": 0.18075402081012726,
+      "learning_rate": 8.144493874011908e-06,
+      "loss": 0.14385528564453126,
+      "step": 4090
+    },
+    {
+      "epoch": 0.7450873362445415,
+      "grad_norm": 0.16514739394187927,
+      "learning_rate": 8.090205216236135e-06,
+      "loss": 0.14920626878738402,
+      "step": 4095
+    },
+    {
+      "epoch": 0.7459970887918487,
+      "grad_norm": 0.16453702747821808,
+      "learning_rate": 8.03606315000797e-06,
+      "loss": 0.14704222679138185,
+      "step": 4100
+    },
+    {
+      "epoch": 0.7469068413391557,
+      "grad_norm": 0.16719917953014374,
+      "learning_rate": 7.982068144686707e-06,
+      "loss": 0.14722511768341065,
+      "step": 4105
+    },
+    {
+      "epoch": 0.7478165938864629,
+      "grad_norm": 0.18499110639095306,
+      "learning_rate": 7.92822066835677e-06,
+      "loss": 0.1401848554611206,
+      "step": 4110
+    },
+    {
+      "epoch": 0.74872634643377,
+      "grad_norm": 0.17249563336372375,
+      "learning_rate": 7.87452118782363e-06,
+      "loss": 0.15132423639297485,
+      "step": 4115
+    },
+    {
+      "epoch": 0.7496360989810772,
+      "grad_norm": 0.15049682557582855,
+      "learning_rate": 7.8209701686098e-06,
+      "loss": 0.1341150164604187,
+      "step": 4120
+    },
+    {
+      "epoch": 0.7505458515283843,
+      "grad_norm": 0.16892646253108978,
+      "learning_rate": 7.767568074950751e-06,
+      "loss": 0.1466840147972107,
+      "step": 4125
+    },
+    {
+      "epoch": 0.7514556040756915,
+      "grad_norm": 0.17288286983966827,
+      "learning_rate": 7.714315369790942e-06,
+      "loss": 0.13819680213928223,
+      "step": 4130
+    },
+    {
+      "epoch": 0.7523653566229985,
+      "grad_norm": 0.21893996000289917,
+      "learning_rate": 7.661212514779745e-06,
+      "loss": 0.14369510412216185,
+      "step": 4135
+    },
+    {
+      "epoch": 0.7532751091703057,
+      "grad_norm": 0.1674601435661316,
+      "learning_rate": 7.608259970267509e-06,
+      "loss": 0.14810250997543334,
+      "step": 4140
+    },
+    {
+      "epoch": 0.7541848617176128,
+      "grad_norm": 0.15875539183616638,
+      "learning_rate": 7.555458195301526e-06,
+      "loss": 0.14103198051452637,
+      "step": 4145
+    },
+    {
+      "epoch": 0.75509461426492,
+      "grad_norm": 0.19454079866409302,
+      "learning_rate": 7.502807647622037e-06,
+      "loss": 0.13848764896392823,
+      "step": 4150
+    },
+    {
+      "epoch": 0.756004366812227,
+      "grad_norm": 0.1795455813407898,
+      "learning_rate": 7.450308783658341e-06,
+      "loss": 0.14459335803985596,
+      "step": 4155
+    },
+    {
+      "epoch": 0.7569141193595342,
+      "grad_norm": 0.1643362045288086,
+      "learning_rate": 7.397962058524735e-06,
+      "loss": 0.14335378408432006,
+      "step": 4160
+    },
+    {
+      "epoch": 0.7578238719068413,
+      "grad_norm": 0.16362066566944122,
+      "learning_rate": 7.3457679260166475e-06,
+      "loss": 0.14222005605697632,
+      "step": 4165
+    },
+    {
+      "epoch": 0.7587336244541485,
+      "grad_norm": 0.17313003540039062,
+      "learning_rate": 7.293726838606674e-06,
+      "loss": 0.14272255897521974,
+      "step": 4170
+    },
+    {
+      "epoch": 0.7596433770014556,
+      "grad_norm": 0.1809929460287094,
+      "learning_rate": 7.2418392474406405e-06,
+      "loss": 0.14089123010635377,
+      "step": 4175
+    },
+    {
+      "epoch": 0.7605531295487628,
+      "grad_norm": 0.14306005835533142,
+      "learning_rate": 7.19010560233373e-06,
+      "loss": 0.13531534671783446,
+      "step": 4180
+    },
+    {
+      "epoch": 0.7614628820960698,
+      "grad_norm": 0.15525390207767487,
+      "learning_rate": 7.138526351766559e-06,
+      "loss": 0.14340845346450806,
+      "step": 4185
+    },
+    {
+      "epoch": 0.762372634643377,
+      "grad_norm": 0.24478943645954132,
+      "learning_rate": 7.087101942881263e-06,
+      "loss": 0.14744555950164795,
+      "step": 4190
+    },
+    {
+      "epoch": 0.7632823871906841,
+      "grad_norm": 0.31335577368736267,
+      "learning_rate": 7.035832821477711e-06,
+      "loss": 0.1484094500541687,
+      "step": 4195
+    },
+    {
+      "epoch": 0.7641921397379913,
+      "grad_norm": 0.15140366554260254,
+      "learning_rate": 6.984719432009515e-06,
+      "loss": 0.14991614818572999,
+      "step": 4200
+    },
+    {
+      "epoch": 0.7651018922852983,
+      "grad_norm": 0.16125506162643433,
+      "learning_rate": 6.933762217580289e-06,
+      "loss": 0.1408134937286377,
+      "step": 4205
+    },
+    {
+      "epoch": 0.7660116448326055,
+      "grad_norm": 0.2501450181007385,
+      "learning_rate": 6.882961619939726e-06,
+      "loss": 0.13875640630722047,
+      "step": 4210
+    },
+    {
+      "epoch": 0.7669213973799127,
+      "grad_norm": 0.16227811574935913,
+      "learning_rate": 6.8323180794798245e-06,
+      "loss": 0.14138660430908204,
+      "step": 4215
+    },
+    {
+      "epoch": 0.7678311499272198,
+      "grad_norm": 0.16676810383796692,
+      "learning_rate": 6.781832035231053e-06,
+      "loss": 0.14696706533432008,
+      "step": 4220
+    },
+    {
+      "epoch": 0.768740902474527,
+      "grad_norm": 0.14638574421405792,
+      "learning_rate": 6.731503924858518e-06,
+      "loss": 0.14263020753860473,
+      "step": 4225
+    },
+    {
+      "epoch": 0.769650655021834,
+      "grad_norm": 0.17093190550804138,
+      "learning_rate": 6.681334184658211e-06,
+      "loss": 0.14694111347198485,
+      "step": 4230
+    },
+    {
+      "epoch": 0.7705604075691412,
+      "grad_norm": 0.17174287140369415,
+      "learning_rate": 6.631323249553201e-06,
+      "loss": 0.13854929208755493,
+      "step": 4235
+    },
+    {
+      "epoch": 0.7714701601164483,
+      "grad_norm": 0.14599016308784485,
+      "learning_rate": 6.5814715530898745e-06,
+      "loss": 0.14058833122253417,
+      "step": 4240
+    },
+    {
+      "epoch": 0.7723799126637555,
+      "grad_norm": 0.16222265362739563,
+      "learning_rate": 6.531779527434176e-06,
+      "loss": 0.1428326725959778,
+      "step": 4245
+    },
+    {
+      "epoch": 0.7732896652110626,
+      "grad_norm": 0.1741994023323059,
+      "learning_rate": 6.482247603367839e-06,
+      "loss": 0.13985042572021483,
+      "step": 4250
+    },
+    {
+      "epoch": 0.7741994177583698,
+      "grad_norm": 0.17427101731300354,
+      "learning_rate": 6.432876210284688e-06,
+      "loss": 0.1442667603492737,
+      "step": 4255
+    },
+    {
+      "epoch": 0.7751091703056768,
+      "grad_norm": 0.1665259599685669,
+      "learning_rate": 6.383665776186912e-06,
+      "loss": 0.1421986222267151,
+      "step": 4260
+    },
+    {
+      "epoch": 0.776018922852984,
+      "grad_norm": 0.1728232353925705,
+      "learning_rate": 6.334616727681303e-06,
+      "loss": 0.1367053508758545,
+      "step": 4265
+    },
+    {
+      "epoch": 0.7769286754002911,
+      "grad_norm": 0.15882381796836853,
+      "learning_rate": 6.285729489975639e-06,
+      "loss": 0.14551182985305786,
+      "step": 4270
+    },
+    {
+      "epoch": 0.7778384279475983,
+      "grad_norm": 0.242042675614357,
+      "learning_rate": 6.2370044868749115e-06,
+      "loss": 0.1455132007598877,
+      "step": 4275
+    },
+    {
+      "epoch": 0.7787481804949054,
+      "grad_norm": 0.1599501073360443,
+      "learning_rate": 6.188442140777742e-06,
+      "loss": 0.1424942970275879,
+      "step": 4280
+    },
+    {
+      "epoch": 0.7796579330422125,
+      "grad_norm": 0.15182635188102722,
+      "learning_rate": 6.140042872672647e-06,
+      "loss": 0.14212887287139891,
+      "step": 4285
+    },
+    {
+      "epoch": 0.7805676855895196,
+      "grad_norm": 0.1720375418663025,
+      "learning_rate": 6.091807102134403e-06,
+      "loss": 0.14243412017822266,
+      "step": 4290
+    },
+    {
+      "epoch": 0.7814774381368268,
+      "grad_norm": 0.16436047852039337,
+      "learning_rate": 6.043735247320454e-06,
+      "loss": 0.15035657882690429,
+      "step": 4295
+    },
+    {
+      "epoch": 0.7823871906841339,
+      "grad_norm": 0.1498408019542694,
+      "learning_rate": 5.995827724967218e-06,
+      "loss": 0.14494839906692505,
+      "step": 4300
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.366369788804259e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-4300/training_args.bin b/checkpoint-4300/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-4300/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-4400/README.md b/checkpoint-4400/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-4400/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-4400/adapter_config.json b/checkpoint-4400/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-4400/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-4400/adapter_model.safetensors b/checkpoint-4400/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..99eee17bdeaf130d77edeec2af4852d28a487d3e
--- /dev/null
+++ b/checkpoint-4400/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:382f8773e3bb8d8b50a829e51e8adeb1f7677513e05f5adb129b1378b0790129
+size 169741912
diff --git a/checkpoint-4400/chat_template.jinja b/checkpoint-4400/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-4400/chat_template.jinja
@@ -0,0 +1,351 @@
+{#- format_parameters: renders one key:{...} entry per property of a tool schema, iterating keys in dictsort order and separating entries with commas. Each entry may carry description, enum (STRING), items (ARRAY), nullable, nested properties/required (OBJECT) and always ends with type:<|"|>TYPE<|"|>. With filter_keys=true, keys listed in standard_keys are skipped. The required argument is accepted but not read inside this macro. -#}{%- macro format_parameters(properties, required, filter_keys=false) -%}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}{#- namespace so the flag survives across loop iterations -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- becomes true once this entry has emitted a field, so later fields are preceded by a comma -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- no usable 'properties' mapping: recurse over the value itself, skipping the standard schema keys -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{#- format_function_declaration: renders one tool as declaration:NAME{description:...,parameters:{...},response:{...}} from tool_data['function']. NOTE(review): the brace closing parameters:{ is only emitted inside the params['type'] branch, and the brace closing response:{ only when the response type upper-cases to OBJECT, so schemas lacking those fields produce unbalanced output - confirm this is intended before relying on it. -#}{%- macro format_function_declaration(tool_data) -%}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{#- format_argument: recursively serialises a value. Strings are wrapped in <|"|> delimiters, booleans become true/false, mappings become {key:value,...} in dictsort order (keys wrapped in <|"|> only when escape_keys is true), other sequences become [item,...], and anything else is printed as-is. The string test comes first, so strings never reach the sequence branch. -#}{%- macro format_argument(argument, escape_keys=True) -%}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{#- strip_thinking: removes thinking-channel spans from text. The text is split on the closing marker '<channel|>'; for every piece that contains the opening marker '<|channel>' only the part before that marker is kept; the kept pieces are concatenated and the result is trimmed. -#}{%- macro strip_thinking(text) -%}
+    {%- set ns = namespace(result='') -%}{#- namespace so the accumulated string survives across loop iterations -#}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{#- format_tool_response_block: emits '<|tool_response>' + 'response:' + tool_name + '{...}' + '<tool_response|>'. A mapping response is written as comma-separated key:value pairs in dictsort order; any other response is written as a single value: field. Values go through format_argument with escape_keys=False. -#}{%- macro format_tool_response_block(tool_name, response) -%}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{#- ns.prev_message_type records the kind of content emitted last; it is read after the message loop to decide whether a generation prompt is needed. -#}{%- set ns = namespace(prev_message_type=None) -%}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- System/tool-definitions block: a system turn is opened when thinking is enabled, when tools are supplied, or when the first message has role system or developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the system/developer message is consumed here and excluded from the main loop -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: record the index (within loop_messages) of the last message whose role is 'user'; it stays -1 when there is none. The message loop only renders reasoning text for messages located after this index. -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages: every non-tool message is rendered as a turn; role 'tool' messages are not rendered on their own but are pulled in by the forward scan of the assistant message that carries the matching tool_calls -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel; only for messages after the last user message that also carry tool_calls -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}{#- string arguments are inserted verbatim between the braces -#}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag becomes true once at least one tool response has been emitted for this message -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- tool call with no response emitted: open a tool_response block instead of closing the turn -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}{#- close the turn unless tool responses were emitted and the message has no text content -#}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Generation prompt: open a new model turn unless the last emitted content was a tool call or tool response -#}{%- if add_generation_prompt -%}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-4400/optimizer.pt b/checkpoint-4400/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a1afc56a4a896f70292ba012ab9e77158a78b525
--- /dev/null
+++ b/checkpoint-4400/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bea3fc6fed190bfcda9845af5cf6f5fe2704ce3e35d310eb436cf67ab8640ddc
+size 72807355
diff --git a/checkpoint-4400/processor_config.json b/checkpoint-4400/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-4400/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-4400/rng_state.pth b/checkpoint-4400/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-4400/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-4400/scheduler.pt b/checkpoint-4400/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..db5f48848af387f93415b758bc879ec59e210237
--- /dev/null
+++ b/checkpoint-4400/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e602cd7d0b8af82ee04a51f3822660b9aefb6c22de1107ff520087edc92e07c8
+size 1465
diff --git a/checkpoint-4400/tokenizer.json b/checkpoint-4400/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-4400/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-4400/tokenizer_config.json b/checkpoint-4400/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-4400/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-4400/trainer_state.json b/checkpoint-4400/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a0923cb7694dab7be7aae7c10e1adfd92f8e2f46
--- /dev/null
+++ b/checkpoint-4400/trainer_state.json
@@ -0,0 +1,6202 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.8005822416302766,
+  "eval_steps": 100,
+  "global_step": 4400,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6377365356622998,
+      "grad_norm": 0.17014239728450775,
+      "learning_rate": 1.5370045130657366e-05,
+      "loss": 0.14694437980651856,
+      "step": 3505
+    },
+    {
+      "epoch": 0.638646288209607,
+      "grad_norm": 0.14744038879871368,
+      "learning_rate": 1.5302158945041838e-05,
+      "loss": 0.14434736967086792,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6395560407569141,
+      "grad_norm": 0.2069770246744156,
+      "learning_rate": 1.523435683031818e-05,
+      "loss": 0.13982917070388795,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6404657933042213,
+      "grad_norm": 0.17811502516269684,
+      "learning_rate": 1.5166639374265063e-05,
+      "loss": 0.1408839702606201,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6413755458515283,
+      "grad_norm": 0.165786474943161,
+      "learning_rate": 1.509900716392728e-05,
+      "loss": 0.15312877893447877,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6422852983988355,
+      "grad_norm": 0.1633884161710739,
+      "learning_rate": 1.5031460785610596e-05,
+      "loss": 0.1488795518875122,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6431950509461426,
+      "grad_norm": 0.16498984396457672,
+      "learning_rate": 1.4964000824876723e-05,
+      "loss": 0.15031465291976928,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6441048034934498,
+      "grad_norm": 0.18043678998947144,
+      "learning_rate": 1.4896627866538191e-05,
+      "loss": 0.147829806804657,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6450145560407569,
+      "grad_norm": 0.16813597083091736,
+      "learning_rate": 1.4829342494653315e-05,
+      "loss": 0.1418998956680298,
+      "step": 3545
+    },
+    {
+      "epoch": 0.645924308588064,
+      "grad_norm": 0.1817242056131363,
+      "learning_rate": 1.4762145292521118e-05,
+      "loss": 0.14508869647979736,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6468340611353712,
+      "grad_norm": 0.14666494727134705,
+      "learning_rate": 1.469503684267628e-05,
+      "loss": 0.14159854650497436,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6477438136826783,
+      "grad_norm": 0.16485381126403809,
+      "learning_rate": 1.4628017726884086e-05,
+      "loss": 0.14419105052947997,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6486535662299855,
+      "grad_norm": 0.16100342571735382,
+      "learning_rate": 1.4561088526135375e-05,
+      "loss": 0.14501721858978273,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6495633187772926,
+      "grad_norm": 0.16996590793132782,
+      "learning_rate": 1.4494249820641493e-05,
+      "loss": 0.1377166509628296,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6504730713245997,
+      "grad_norm": 0.16168837249279022,
+      "learning_rate": 1.4427502189829339e-05,
+      "loss": 0.1414325475692749,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6513828238719068,
+      "grad_norm": 0.16318906843662262,
+      "learning_rate": 1.436084621233621e-05,
+      "loss": 0.14685193300247193,
+      "step": 3580
+    },
+    {
+      "epoch": 0.652292576419214,
+      "grad_norm": 0.1636219322681427,
+      "learning_rate": 1.4294282466004899e-05,
+      "loss": 0.1405899167060852,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6532023289665211,
+      "grad_norm": 0.1838461309671402,
+      "learning_rate": 1.422781152787865e-05,
+      "loss": 0.14386332035064697,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6541120815138283,
+      "grad_norm": 0.1796344667673111,
+      "learning_rate": 1.4161433974196115e-05,
+      "loss": 0.1513024687767029,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6550218340611353,
+      "grad_norm": 0.16424529254436493,
+      "learning_rate": 1.4095150380386427e-05,
+      "loss": 0.14238927364349366,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6559315866084425,
+      "grad_norm": 0.19264160096645355,
+      "learning_rate": 1.402896132106415e-05,
+      "loss": 0.14297477006912232,
+      "step": 3605
+    },
+    {
+      "epoch": 0.6568413391557496,
+      "grad_norm": 0.18319948017597198,
+      "learning_rate": 1.3962867370024347e-05,
+      "loss": 0.1448880434036255,
+      "step": 3610
+    },
+    {
+      "epoch": 0.6577510917030568,
+      "grad_norm": 0.16507290303707123,
+      "learning_rate": 1.389686910023758e-05,
+      "loss": 0.14724698066711425,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6586608442503639,
+      "grad_norm": 0.17871244251728058,
+      "learning_rate": 1.3830967083844942e-05,
+      "loss": 0.14479386806488037,
+      "step": 3620
+    },
+    {
+      "epoch": 0.659570596797671,
+      "grad_norm": 0.1846228390932083,
+      "learning_rate": 1.3765161892153112e-05,
+      "loss": 0.1453616738319397,
+      "step": 3625
+    },
+    {
+      "epoch": 0.6604803493449781,
+      "grad_norm": 0.17185978591442108,
+      "learning_rate": 1.3699454095629372e-05,
+      "loss": 0.14906206130981445,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6613901018922853,
+      "grad_norm": 0.14751191437244415,
+      "learning_rate": 1.3633844263896698e-05,
+      "loss": 0.13991892337799072,
+      "step": 3635
+    },
+    {
+      "epoch": 0.6622998544395924,
+      "grad_norm": 0.22059763967990875,
+      "learning_rate": 1.3568332965728817e-05,
+      "loss": 0.14680869579315187,
+      "step": 3640
+    },
+    {
+      "epoch": 0.6632096069868996,
+      "grad_norm": 0.15295909345149994,
+      "learning_rate": 1.3502920769045232e-05,
+      "loss": 0.1404443383216858,
+      "step": 3645
+    },
+    {
+      "epoch": 0.6641193595342066,
+      "grad_norm": 0.14600558578968048,
+      "learning_rate": 1.3437608240906364e-05,
+      "loss": 0.14663270711898804,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6650291120815138,
+      "grad_norm": 0.15548352897167206,
+      "learning_rate": 1.3372395947508587e-05,
+      "loss": 0.1431443452835083,
+      "step": 3655
+    },
+    {
+      "epoch": 0.665938864628821,
+      "grad_norm": 0.1813388466835022,
+      "learning_rate": 1.3307284454179342e-05,
+      "loss": 0.1458706736564636,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6668486171761281,
+      "grad_norm": 0.16326870024204254,
+      "learning_rate": 1.3242274325372247e-05,
+      "loss": 0.14700595140457154,
+      "step": 3665
+    },
+    {
+      "epoch": 0.6677583697234353,
+      "grad_norm": 0.18779197335243225,
+      "learning_rate": 1.3177366124662149e-05,
+      "loss": 0.1497237801551819,
+      "step": 3670
+    },
+    {
+      "epoch": 0.6686681222707423,
+      "grad_norm": 0.16291002929210663,
+      "learning_rate": 1.3112560414740315e-05,
+      "loss": 0.1387086868286133,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6695778748180495,
+      "grad_norm": 0.1532297134399414,
+      "learning_rate": 1.3047857757409487e-05,
+      "loss": 0.14497545957565308,
+      "step": 3680
+    },
+    {
+      "epoch": 0.6704876273653566,
+      "grad_norm": 0.14697515964508057,
+      "learning_rate": 1.2983258713579066e-05,
+      "loss": 0.1494283437728882,
+      "step": 3685
+    },
+    {
+      "epoch": 0.6713973799126638,
+      "grad_norm": 0.15213452279567719,
+      "learning_rate": 1.2918763843260218e-05,
+      "loss": 0.1468907594680786,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6723071324599709,
+      "grad_norm": 0.1745215803384781,
+      "learning_rate": 1.285437370556099e-05,
+      "loss": 0.14997754096984864,
+      "step": 3695
+    },
+    {
+      "epoch": 0.673216885007278,
+      "grad_norm": 0.19207637012004852,
+      "learning_rate": 1.2790088858681577e-05,
+      "loss": 0.14202862977981567,
+      "step": 3700
+    },
+    {
+      "epoch": 0.6741266375545851,
+      "grad_norm": 0.1521359086036682,
+      "learning_rate": 1.2725909859909313e-05,
+      "loss": 0.14547673463821412,
+      "step": 3705
+    },
+    {
+      "epoch": 0.6750363901018923,
+      "grad_norm": 0.16975535452365875,
+      "learning_rate": 1.2661837265613999e-05,
+      "loss": 0.14006874561309815,
+      "step": 3710
+    },
+    {
+      "epoch": 0.6759461426491994,
+      "grad_norm": 0.22234582901000977,
+      "learning_rate": 1.2597871631242992e-05,
+      "loss": 0.13691173791885375,
+      "step": 3715
+    },
+    {
+      "epoch": 0.6768558951965066,
+      "grad_norm": 0.16082969307899475,
+      "learning_rate": 1.2534013511316383e-05,
+      "loss": 0.14932308197021485,
+      "step": 3720
+    },
+    {
+      "epoch": 0.6777656477438136,
+      "grad_norm": 0.1751091182231903,
+      "learning_rate": 1.247026345942226e-05,
+      "loss": 0.14531974792480468,
+      "step": 3725
+    },
+    {
+      "epoch": 0.6786754002911208,
+      "grad_norm": 0.15838147699832916,
+      "learning_rate": 1.2406622028211844e-05,
+      "loss": 0.14759832620620728,
+      "step": 3730
+    },
+    {
+      "epoch": 0.6795851528384279,
+      "grad_norm": 0.1771744042634964,
+      "learning_rate": 1.2343089769394714e-05,
+      "loss": 0.1382831573486328,
+      "step": 3735
+    },
+    {
+      "epoch": 0.6804949053857351,
+      "grad_norm": 0.16301538050174713,
+      "learning_rate": 1.2279667233734037e-05,
+      "loss": 0.14444775581359864,
+      "step": 3740
+    },
+    {
+      "epoch": 0.6814046579330422,
+      "grad_norm": 0.1584121286869049,
+      "learning_rate": 1.2216354971041796e-05,
+      "loss": 0.14200170040130616,
+      "step": 3745
+    },
+    {
+      "epoch": 0.6823144104803494,
+      "grad_norm": 0.139187291264534,
+      "learning_rate": 1.2153153530174007e-05,
+      "loss": 0.14318310022354125,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6832241630276564,
+      "grad_norm": 0.13665248453617096,
+      "learning_rate": 1.2090063459025955e-05,
+      "loss": 0.1411946654319763,
+      "step": 3755
+    },
+    {
+      "epoch": 0.6841339155749636,
+      "grad_norm": 0.16273781657218933,
+      "learning_rate": 1.2027085304527475e-05,
+      "loss": 0.14873508214950562,
+      "step": 3760
+    },
+    {
+      "epoch": 0.6850436681222707,
+      "grad_norm": 0.16317526996135712,
+      "learning_rate": 1.1964219612638194e-05,
+      "loss": 0.14644203186035157,
+      "step": 3765
+    },
+    {
+      "epoch": 0.6859534206695779,
+      "grad_norm": 0.17253617942333221,
+      "learning_rate": 1.1901466928342777e-05,
+      "loss": 0.14027841091156007,
+      "step": 3770
+    },
+    {
+      "epoch": 0.6868631732168851,
+      "grad_norm": 0.19692830741405487,
+      "learning_rate": 1.183882779564624e-05,
+      "loss": 0.14411110877990724,
+      "step": 3775
+    },
+    {
+      "epoch": 0.6877729257641921,
+      "grad_norm": 0.15444578230381012,
+      "learning_rate": 1.1776302757569214e-05,
+      "loss": 0.14355008602142333,
+      "step": 3780
+    },
+    {
+      "epoch": 0.6886826783114993,
+      "grad_norm": 0.1622200757265091,
+      "learning_rate": 1.1713892356143239e-05,
+      "loss": 0.14794334173202514,
+      "step": 3785
+    },
+    {
+      "epoch": 0.6895924308588064,
+      "grad_norm": 0.1898501068353653,
+      "learning_rate": 1.1651597132406073e-05,
+      "loss": 0.1418622612953186,
+      "step": 3790
+    },
+    {
+      "epoch": 0.6905021834061136,
+      "grad_norm": 0.17803208529949188,
+      "learning_rate": 1.1589417626396973e-05,
+      "loss": 0.14576040506362914,
+      "step": 3795
+    },
+    {
+      "epoch": 0.6914119359534207,
+      "grad_norm": 0.17138013243675232,
+      "learning_rate": 1.1527354377152053e-05,
+      "loss": 0.14494270086288452,
+      "step": 3800
+    },
+    {
+      "epoch": 0.6923216885007278,
+      "grad_norm": 0.15170913934707642,
+      "learning_rate": 1.1465407922699603e-05,
+      "loss": 0.144084370136261,
+      "step": 3805
+    },
+    {
+      "epoch": 0.6932314410480349,
+      "grad_norm": 0.158562570810318,
+      "learning_rate": 1.1403578800055387e-05,
+      "loss": 0.13636608123779298,
+      "step": 3810
+    },
+    {
+      "epoch": 0.6941411935953421,
+      "grad_norm": 0.17687302827835083,
+      "learning_rate": 1.1341867545218044e-05,
+      "loss": 0.14214688539505005,
+      "step": 3815
+    },
+    {
+      "epoch": 0.6950509461426492,
+      "grad_norm": 0.15394899249076843,
+      "learning_rate": 1.1280274693164378e-05,
+      "loss": 0.14914129972457885,
+      "step": 3820
+    },
+    {
+      "epoch": 0.6959606986899564,
+      "grad_norm": 0.15709355473518372,
+      "learning_rate": 1.12188007778448e-05,
+      "loss": 0.14798580408096312,
+      "step": 3825
+    },
+    {
+      "epoch": 0.6968704512372634,
+      "grad_norm": 0.16631539165973663,
+      "learning_rate": 1.115744633217864e-05,
+      "loss": 0.14756966829299928,
+      "step": 3830
+    },
+    {
+      "epoch": 0.6977802037845706,
+      "grad_norm": 0.15893076360225677,
+      "learning_rate": 1.109621188804951e-05,
+      "loss": 0.14061959981918334,
+      "step": 3835
+    },
+    {
+      "epoch": 0.6986899563318777,
+      "grad_norm": 0.183414489030838,
+      "learning_rate": 1.103509797630077e-05,
+      "loss": 0.1448473334312439,
+      "step": 3840
+    },
+    {
+      "epoch": 0.6995997088791849,
+      "grad_norm": 0.14087305963039398,
+      "learning_rate": 1.0974105126730841e-05,
+      "loss": 0.14369285106658936,
+      "step": 3845
+    },
+    {
+      "epoch": 0.700509461426492,
+      "grad_norm": 0.16919967532157898,
+      "learning_rate": 1.0913233868088685e-05,
+      "loss": 0.1478085398674011,
+      "step": 3850
+    },
+    {
+      "epoch": 0.7014192139737991,
+      "grad_norm": 0.1439533829689026,
+      "learning_rate": 1.0852484728069178e-05,
+      "loss": 0.14376721382141114,
+      "step": 3855
+    },
+    {
+      "epoch": 0.7023289665211062,
+      "grad_norm": 0.17719274759292603,
+      "learning_rate": 1.0791858233308521e-05,
+      "loss": 0.14089040756225585,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7032387190684134,
+      "grad_norm": 0.19753769040107727,
+      "learning_rate": 1.0731354909379754e-05,
+      "loss": 0.15021742582321168,
+      "step": 3865
+    },
+    {
+      "epoch": 0.7041484716157205,
+      "grad_norm": 0.19186992943286896,
+      "learning_rate": 1.0670975280788086e-05,
+      "loss": 0.14113202095031738,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7050582241630277,
+      "grad_norm": 0.1709229201078415,
+      "learning_rate": 1.0610719870966443e-05,
+      "loss": 0.1500566840171814,
+      "step": 3875
+    },
+    {
+      "epoch": 0.7059679767103348,
+      "grad_norm": 0.17846204340457916,
+      "learning_rate": 1.0550589202270892e-05,
+      "loss": 0.15014195442199707,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7068777292576419,
+      "grad_norm": 0.1827082335948944,
+      "learning_rate": 1.0490583795976091e-05,
+      "loss": 0.1423472762107849,
+      "step": 3885
+    },
+    {
+      "epoch": 0.7077874818049491,
+      "grad_norm": 0.17418377101421356,
+      "learning_rate": 1.043070417227083e-05,
+      "loss": 0.14668900966644288,
+      "step": 3890
+    },
+    {
+      "epoch": 0.7086972343522562,
+      "grad_norm": 0.17385616898536682,
+      "learning_rate": 1.0370950850253449e-05,
+      "loss": 0.14627279043197633,
+      "step": 3895
+    },
+    {
+      "epoch": 0.7096069868995634,
+      "grad_norm": 0.16486723721027374,
+      "learning_rate": 1.0311324347927404e-05,
+      "loss": 0.14603652954101562,
+      "step": 3900
+    },
+    {
+      "epoch": 0.7105167394468704,
+      "grad_norm": 0.21806862950325012,
+      "learning_rate": 1.0251825182196732e-05,
+      "loss": 0.1488169550895691,
+      "step": 3905
+    },
+    {
+      "epoch": 0.7114264919941776,
+      "grad_norm": 0.19884569942951202,
+      "learning_rate": 1.019245386886159e-05,
+      "loss": 0.14387656450271608,
+      "step": 3910
+    },
+    {
+      "epoch": 0.7123362445414847,
+      "grad_norm": 0.16139011085033417,
+      "learning_rate": 1.0133210922613789e-05,
+      "loss": 0.1483074426651001,
+      "step": 3915
+    },
+    {
+      "epoch": 0.7132459970887919,
+      "grad_norm": 0.17000740766525269,
+      "learning_rate": 1.007409685703229e-05,
+      "loss": 0.14050065279006957,
+      "step": 3920
+    },
+    {
+      "epoch": 0.714155749636099,
+      "grad_norm": 0.17235304415225983,
+      "learning_rate": 1.0015112184578813e-05,
+      "loss": 0.1440442681312561,
+      "step": 3925
+    },
+    {
+      "epoch": 0.7150655021834061,
+      "grad_norm": 0.15737567842006683,
+      "learning_rate": 9.956257416593362e-06,
+      "loss": 0.14960765838623047,
+      "step": 3930
+    },
+    {
+      "epoch": 0.7159752547307132,
+      "grad_norm": 0.15499180555343628,
+      "learning_rate": 9.897533063289773e-06,
+      "loss": 0.14488829374313356,
+      "step": 3935
+    },
+    {
+      "epoch": 0.7168850072780204,
+      "grad_norm": 0.17744216322898865,
+      "learning_rate": 9.838939633751337e-06,
+      "loss": 0.1416949987411499,
+      "step": 3940
+    },
+    {
+      "epoch": 0.7177947598253275,
+      "grad_norm": 0.1597192883491516,
+      "learning_rate": 9.780477635926358e-06,
+      "loss": 0.14275280237197877,
+      "step": 3945
+    },
+    {
+      "epoch": 0.7187045123726347,
+      "grad_norm": 0.17800374329090118,
+      "learning_rate": 9.722147576623743e-06,
+      "loss": 0.14532098770141602,
+      "step": 3950
+    },
+    {
+      "epoch": 0.7196142649199417,
+      "grad_norm": 0.1828162521123886,
+      "learning_rate": 9.66394996150864e-06,
+      "loss": 0.14525585174560546,
+      "step": 3955
+    },
+    {
+      "epoch": 0.7205240174672489,
+      "grad_norm": 0.1800539344549179,
+      "learning_rate": 9.605885295098005e-06,
+      "loss": 0.14235819578170777,
+      "step": 3960
+    },
+    {
+      "epoch": 0.721433770014556,
+      "grad_norm": 0.16556483507156372,
+      "learning_rate": 9.54795408075628e-06,
+      "loss": 0.13965482711791993,
+      "step": 3965
+    },
+    {
+      "epoch": 0.7223435225618632,
+      "grad_norm": 0.1592024862766266,
+      "learning_rate": 9.49015682069101e-06,
+      "loss": 0.14051042795181273,
+      "step": 3970
+    },
+    {
+      "epoch": 0.7232532751091703,
+      "grad_norm": 0.18988847732543945,
+      "learning_rate": 9.43249401594846e-06,
+      "loss": 0.1436900496482849,
+      "step": 3975
+    },
+    {
+      "epoch": 0.7241630276564774,
+      "grad_norm": 0.24433808028697968,
+      "learning_rate": 9.374966166409329e-06,
+      "loss": 0.14883997440338134,
+      "step": 3980
+    },
+    {
+      "epoch": 0.7250727802037845,
+      "grad_norm": 0.15091639757156372,
+      "learning_rate": 9.317573770784352e-06,
+      "loss": 0.14726560115814208,
+      "step": 3985
+    },
+    {
+      "epoch": 0.7259825327510917,
+      "grad_norm": 0.17045573890209198,
+      "learning_rate": 9.260317326610051e-06,
+      "loss": 0.14120506048202514,
+      "step": 3990
+    },
+    {
+      "epoch": 0.7268922852983989,
+      "grad_norm": 0.18847957253456116,
+      "learning_rate": 9.203197330244343e-06,
+      "loss": 0.1377041220664978,
+      "step": 3995
+    },
+    {
+      "epoch": 0.727802037845706,
+      "grad_norm": 0.1516445279121399,
+      "learning_rate": 9.14621427686229e-06,
+      "loss": 0.14043946266174318,
+      "step": 4000
+    },
+    {
+      "epoch": 0.7287117903930131,
+      "grad_norm": 0.18264050781726837,
+      "learning_rate": 9.0893686604518e-06,
+      "loss": 0.14080368280410765,
+      "step": 4005
+    },
+    {
+      "epoch": 0.7296215429403202,
+      "grad_norm": 0.19129371643066406,
+      "learning_rate": 9.032660973809312e-06,
+      "loss": 0.1402561902999878,
+      "step": 4010
+    },
+    {
+      "epoch": 0.7305312954876274,
+      "grad_norm": 0.15762710571289062,
+      "learning_rate": 8.976091708535567e-06,
+      "loss": 0.14421157836914061,
+      "step": 4015
+    },
+    {
+      "epoch": 0.7314410480349345,
+      "grad_norm": 0.17785198986530304,
+      "learning_rate": 8.919661355031331e-06,
+      "loss": 0.14999009370803834,
+      "step": 4020
+    },
+    {
+      "epoch": 0.7323508005822417,
+      "grad_norm": 0.15306031703948975,
+      "learning_rate": 8.8633704024931e-06,
+      "loss": 0.14101698398590087,
+      "step": 4025
+    },
+    {
+      "epoch": 0.7332605531295487,
+      "grad_norm": 0.16481758654117584,
+      "learning_rate": 8.807219338908968e-06,
+      "loss": 0.14170764684677123,
+      "step": 4030
+    },
+    {
+      "epoch": 0.7341703056768559,
+      "grad_norm": 0.14892235398292542,
+      "learning_rate": 8.751208651054257e-06,
+      "loss": 0.15317896604537964,
+      "step": 4035
+    },
+    {
+      "epoch": 0.735080058224163,
+      "grad_norm": 0.1775592565536499,
+      "learning_rate": 8.695338824487409e-06,
+      "loss": 0.1520617723464966,
+      "step": 4040
+    },
+    {
+      "epoch": 0.7359898107714702,
+      "grad_norm": 0.1614258885383606,
+      "learning_rate": 8.639610343545728e-06,
+      "loss": 0.13747400045394897,
+      "step": 4045
+    },
+    {
+      "epoch": 0.7368995633187773,
+      "grad_norm": 0.21415506303310394,
+      "learning_rate": 8.58402369134117e-06,
+      "loss": 0.1432439088821411,
+      "step": 4050
+    },
+    {
+      "epoch": 0.7378093158660844,
+      "grad_norm": 0.1759418249130249,
+      "learning_rate": 8.528579349756205e-06,
+      "loss": 0.141641104221344,
+      "step": 4055
+    },
+    {
+      "epoch": 0.7387190684133915,
+      "grad_norm": 0.16738329827785492,
+      "learning_rate": 8.47327779943957e-06,
+      "loss": 0.14294810295104982,
+      "step": 4060
+    },
+    {
+      "epoch": 0.7396288209606987,
+      "grad_norm": 0.13916844129562378,
+      "learning_rate": 8.41811951980217e-06,
+      "loss": 0.13876968622207642,
+      "step": 4065
+    },
+    {
+      "epoch": 0.7405385735080058,
+      "grad_norm": 0.1828441321849823,
+      "learning_rate": 8.36310498901288e-06,
+      "loss": 0.148428475856781,
+      "step": 4070
+    },
+    {
+      "epoch": 0.741448326055313,
+      "grad_norm": 0.16534076631069183,
+      "learning_rate": 8.308234683994415e-06,
+      "loss": 0.14222711324691772,
+      "step": 4075
+    },
+    {
+      "epoch": 0.74235807860262,
+      "grad_norm": 0.17922644317150116,
+      "learning_rate": 8.253509080419198e-06,
+      "loss": 0.14365782737731933,
+      "step": 4080
+    },
+    {
+      "epoch": 0.7432678311499272,
+      "grad_norm": 0.15061035752296448,
+      "learning_rate": 8.198928652705204e-06,
+      "loss": 0.13571925163269044,
+      "step": 4085
+    },
+    {
+      "epoch": 0.7441775836972343,
+      "grad_norm": 0.18075402081012726,
+      "learning_rate": 8.144493874011908e-06,
+      "loss": 0.14385528564453126,
+      "step": 4090
+    },
+    {
+      "epoch": 0.7450873362445415,
+      "grad_norm": 0.16514739394187927,
+      "learning_rate": 8.090205216236135e-06,
+      "loss": 0.14920626878738402,
+      "step": 4095
+    },
+    {
+      "epoch": 0.7459970887918487,
+      "grad_norm": 0.16453702747821808,
+      "learning_rate": 8.03606315000797e-06,
+      "loss": 0.14704222679138185,
+      "step": 4100
+    },
+    {
+      "epoch": 0.7469068413391557,
+      "grad_norm": 0.16719917953014374,
+      "learning_rate": 7.982068144686707e-06,
+      "loss": 0.14722511768341065,
+      "step": 4105
+    },
+    {
+      "epoch": 0.7478165938864629,
+      "grad_norm": 0.18499110639095306,
+      "learning_rate": 7.92822066835677e-06,
+      "loss": 0.1401848554611206,
+      "step": 4110
+    },
+    {
+      "epoch": 0.74872634643377,
+      "grad_norm": 0.17249563336372375,
+      "learning_rate": 7.87452118782363e-06,
+      "loss": 0.15132423639297485,
+      "step": 4115
+    },
+    {
+      "epoch": 0.7496360989810772,
+      "grad_norm": 0.15049682557582855,
+      "learning_rate": 7.8209701686098e-06,
+      "loss": 0.1341150164604187,
+      "step": 4120
+    },
+    {
+      "epoch": 0.7505458515283843,
+      "grad_norm": 0.16892646253108978,
+      "learning_rate": 7.767568074950751e-06,
+      "loss": 0.1466840147972107,
+      "step": 4125
+    },
+    {
+      "epoch": 0.7514556040756915,
+      "grad_norm": 0.17288286983966827,
+      "learning_rate": 7.714315369790942e-06,
+      "loss": 0.13819680213928223,
+      "step": 4130
+    },
+    {
+      "epoch": 0.7523653566229985,
+      "grad_norm": 0.21893996000289917,
+      "learning_rate": 7.661212514779745e-06,
+      "loss": 0.14369510412216185,
+      "step": 4135
+    },
+    {
+      "epoch": 0.7532751091703057,
+      "grad_norm": 0.1674601435661316,
+      "learning_rate": 7.608259970267509e-06,
+      "loss": 0.14810250997543334,
+      "step": 4140
+    },
+    {
+      "epoch": 0.7541848617176128,
+      "grad_norm": 0.15875539183616638,
+      "learning_rate": 7.555458195301526e-06,
+      "loss": 0.14103198051452637,
+      "step": 4145
+    },
+    {
+      "epoch": 0.75509461426492,
+      "grad_norm": 0.19454079866409302,
+      "learning_rate": 7.502807647622037e-06,
+      "loss": 0.13848764896392823,
+      "step": 4150
+    },
+    {
+      "epoch": 0.756004366812227,
+      "grad_norm": 0.1795455813407898,
+      "learning_rate": 7.450308783658341e-06,
+      "loss": 0.14459335803985596,
+      "step": 4155
+    },
+    {
+      "epoch": 0.7569141193595342,
+      "grad_norm": 0.1643362045288086,
+      "learning_rate": 7.397962058524735e-06,
+      "loss": 0.14335378408432006,
+      "step": 4160
+    },
+    {
+      "epoch": 0.7578238719068413,
+      "grad_norm": 0.16362066566944122,
+      "learning_rate": 7.3457679260166475e-06,
+      "loss": 0.14222005605697632,
+      "step": 4165
+    },
+    {
+      "epoch": 0.7587336244541485,
+      "grad_norm": 0.17313003540039062,
+      "learning_rate": 7.293726838606674e-06,
+      "loss": 0.14272255897521974,
+      "step": 4170
+    },
+    {
+      "epoch": 0.7596433770014556,
+      "grad_norm": 0.1809929460287094,
+      "learning_rate": 7.2418392474406405e-06,
+      "loss": 0.14089123010635377,
+      "step": 4175
+    },
+    {
+      "epoch": 0.7605531295487628,
+      "grad_norm": 0.14306005835533142,
+      "learning_rate": 7.19010560233373e-06,
+      "loss": 0.13531534671783446,
+      "step": 4180
+    },
+    {
+      "epoch": 0.7614628820960698,
+      "grad_norm": 0.15525390207767487,
+      "learning_rate": 7.138526351766559e-06,
+      "loss": 0.14340845346450806,
+      "step": 4185
+    },
+    {
+      "epoch": 0.762372634643377,
+      "grad_norm": 0.24478943645954132,
+      "learning_rate": 7.087101942881263e-06,
+      "loss": 0.14744555950164795,
+      "step": 4190
+    },
+    {
+      "epoch": 0.7632823871906841,
+      "grad_norm": 0.31335577368736267,
+      "learning_rate": 7.035832821477711e-06,
+      "loss": 0.1484094500541687,
+      "step": 4195
+    },
+    {
+      "epoch": 0.7641921397379913,
+      "grad_norm": 0.15140366554260254,
+      "learning_rate": 6.984719432009515e-06,
+      "loss": 0.14991614818572999,
+      "step": 4200
+    },
+    {
+      "epoch": 0.7651018922852983,
+      "grad_norm": 0.16125506162643433,
+      "learning_rate": 6.933762217580289e-06,
+      "loss": 0.1408134937286377,
+      "step": 4205
+    },
+    {
+      "epoch": 0.7660116448326055,
+      "grad_norm": 0.2501450181007385,
+      "learning_rate": 6.882961619939726e-06,
+      "loss": 0.13875640630722047,
+      "step": 4210
+    },
+    {
+      "epoch": 0.7669213973799127,
+      "grad_norm": 0.16227811574935913,
+      "learning_rate": 6.8323180794798245e-06,
+      "loss": 0.14138660430908204,
+      "step": 4215
+    },
+    {
+      "epoch": 0.7678311499272198,
+      "grad_norm": 0.16676810383796692,
+      "learning_rate": 6.781832035231053e-06,
+      "loss": 0.14696706533432008,
+      "step": 4220
+    },
+    {
+      "epoch": 0.768740902474527,
+      "grad_norm": 0.14638574421405792,
+      "learning_rate": 6.731503924858518e-06,
+      "loss": 0.14263020753860473,
+      "step": 4225
+    },
+    {
+      "epoch": 0.769650655021834,
+      "grad_norm": 0.17093190550804138,
+      "learning_rate": 6.681334184658211e-06,
+      "loss": 0.14694111347198485,
+      "step": 4230
+    },
+    {
+      "epoch": 0.7705604075691412,
+      "grad_norm": 0.17174287140369415,
+      "learning_rate": 6.631323249553201e-06,
+      "loss": 0.13854929208755493,
+      "step": 4235
+    },
+    {
+      "epoch": 0.7714701601164483,
+      "grad_norm": 0.14599016308784485,
+      "learning_rate": 6.5814715530898745e-06,
+      "loss": 0.14058833122253417,
+      "step": 4240
+    },
+    {
+      "epoch": 0.7723799126637555,
+      "grad_norm": 0.16222265362739563,
+      "learning_rate": 6.531779527434176e-06,
+      "loss": 0.1428326725959778,
+      "step": 4245
+    },
+    {
+      "epoch": 0.7732896652110626,
+      "grad_norm": 0.1741994023323059,
+      "learning_rate": 6.482247603367839e-06,
+      "loss": 0.13985042572021483,
+      "step": 4250
+    },
+    {
+      "epoch": 0.7741994177583698,
+      "grad_norm": 0.17427101731300354,
+      "learning_rate": 6.432876210284688e-06,
+      "loss": 0.1442667603492737,
+      "step": 4255
+    },
+    {
+      "epoch": 0.7751091703056768,
+      "grad_norm": 0.1665259599685669,
+      "learning_rate": 6.383665776186912e-06,
+      "loss": 0.1421986222267151,
+      "step": 4260
+    },
+    {
+      "epoch": 0.776018922852984,
+      "grad_norm": 0.1728232353925705,
+      "learning_rate": 6.334616727681303e-06,
+      "loss": 0.1367053508758545,
+      "step": 4265
+    },
+    {
+      "epoch": 0.7769286754002911,
+      "grad_norm": 0.15882381796836853,
+      "learning_rate": 6.285729489975639e-06,
+      "loss": 0.14551182985305786,
+      "step": 4270
+    },
+    {
+      "epoch": 0.7778384279475983,
+      "grad_norm": 0.242042675614357,
+      "learning_rate": 6.2370044868749115e-06,
+      "loss": 0.1455132007598877,
+      "step": 4275
+    },
+    {
+      "epoch": 0.7787481804949054,
+      "grad_norm": 0.1599501073360443,
+      "learning_rate": 6.188442140777742e-06,
+      "loss": 0.1424942970275879,
+      "step": 4280
+    },
+    {
+      "epoch": 0.7796579330422125,
+      "grad_norm": 0.15182635188102722,
+      "learning_rate": 6.140042872672647e-06,
+      "loss": 0.14212887287139891,
+      "step": 4285
+    },
+    {
+      "epoch": 0.7805676855895196,
+      "grad_norm": 0.1720375418663025,
+      "learning_rate": 6.091807102134403e-06,
+      "loss": 0.14243412017822266,
+      "step": 4290
+    },
+    {
+      "epoch": 0.7814774381368268,
+      "grad_norm": 0.16436047852039337,
+      "learning_rate": 6.043735247320454e-06,
+      "loss": 0.15035657882690429,
+      "step": 4295
+    },
+    {
+      "epoch": 0.7823871906841339,
+      "grad_norm": 0.1498408019542694,
+      "learning_rate": 5.995827724967218e-06,
+      "loss": 0.14494839906692505,
+      "step": 4300
+    },
+    {
+      "epoch": 0.7832969432314411,
+      "grad_norm": 0.16924560070037842,
+      "learning_rate": 5.948084950386535e-06,
+      "loss": 0.13581212759017944,
+      "step": 4305
+    },
+    {
+      "epoch": 0.7842066957787481,
+      "grad_norm": 0.15889139473438263,
+      "learning_rate": 5.900507337462036e-06,
+      "loss": 0.15071530342102052,
+      "step": 4310
+    },
+    {
+      "epoch": 0.7851164483260553,
+      "grad_norm": 0.17201054096221924,
+      "learning_rate": 5.853095298645542e-06,
+      "loss": 0.1398628830909729,
+      "step": 4315
+    },
+    {
+      "epoch": 0.7860262008733624,
+      "grad_norm": 0.17965619266033173,
+      "learning_rate": 5.805849244953548e-06,
+      "loss": 0.14666696786880493,
+      "step": 4320
+    },
+    {
+      "epoch": 0.7869359534206696,
+      "grad_norm": 0.17514032125473022,
+      "learning_rate": 5.758769585963569e-06,
+      "loss": 0.1383386731147766,
+      "step": 4325
+    },
+    {
+      "epoch": 0.7878457059679768,
+      "grad_norm": 0.17497631907463074,
+      "learning_rate": 5.7118567298106744e-06,
+      "loss": 0.14362354278564454,
+      "step": 4330
+    },
+    {
+      "epoch": 0.7887554585152838,
+      "grad_norm": 0.16770458221435547,
+      "learning_rate": 5.665111083183905e-06,
+      "loss": 0.14136618375778198,
+      "step": 4335
+    },
+    {
+      "epoch": 0.789665211062591,
+      "grad_norm": 0.17134106159210205,
+      "learning_rate": 5.618533051322747e-06,
+      "loss": 0.1401529550552368,
+      "step": 4340
+    },
+    {
+      "epoch": 0.7905749636098981,
+      "grad_norm": 0.19458788633346558,
+      "learning_rate": 5.5721230380136435e-06,
+      "loss": 0.1393273115158081,
+      "step": 4345
+    },
+    {
+      "epoch": 0.7914847161572053,
+      "grad_norm": 0.19483692944049835,
+      "learning_rate": 5.525881445586467e-06,
+      "loss": 0.1369825482368469,
+      "step": 4350
+    },
+    {
+      "epoch": 0.7923944687045124,
+      "grad_norm": 0.3052191734313965,
+      "learning_rate": 5.4798086749110495e-06,
+      "loss": 0.14762181043624878,
+      "step": 4355
+    },
+    {
+      "epoch": 0.7933042212518195,
+      "grad_norm": 0.164458766579628,
+      "learning_rate": 5.4339051253937065e-06,
+      "loss": 0.14501686096191407,
+      "step": 4360
+    },
+    {
+      "epoch": 0.7942139737991266,
+      "grad_norm": 0.1719193458557129,
+      "learning_rate": 5.3881711949737625e-06,
+      "loss": 0.13321092128753662,
+      "step": 4365
+    },
+    {
+      "epoch": 0.7951237263464338,
+      "grad_norm": 0.17219696938991547,
+      "learning_rate": 5.342607280120121e-06,
+      "loss": 0.1413906455039978,
+      "step": 4370
+    },
+    {
+      "epoch": 0.7960334788937409,
+      "grad_norm": 0.15083056688308716,
+      "learning_rate": 5.297213775827789e-06,
+      "loss": 0.14772192239761353,
+      "step": 4375
+    },
+    {
+      "epoch": 0.7969432314410481,
+      "grad_norm": 0.1699071079492569,
+      "learning_rate": 5.251991075614507e-06,
+      "loss": 0.1392375946044922,
+      "step": 4380
+    },
+    {
+      "epoch": 0.7978529839883551,
+      "grad_norm": 0.1680395007133484,
+      "learning_rate": 5.206939571517302e-06,
+      "loss": 0.14185575246810914,
+      "step": 4385
+    },
+    {
+      "epoch": 0.7987627365356623,
+      "grad_norm": 0.16526710987091064,
+      "learning_rate": 5.162059654089083e-06,
+      "loss": 0.15001428127288818,
+      "step": 4390
+    },
+    {
+      "epoch": 0.7996724890829694,
+      "grad_norm": 0.16281752288341522,
+      "learning_rate": 5.1173517123952794e-06,
+      "loss": 0.13747023344039916,
+      "step": 4395
+    },
+    {
+      "epoch": 0.8005822416302766,
+      "grad_norm": 0.1454378366470337,
+      "learning_rate": 5.072816134010458e-06,
+      "loss": 0.14710829257965088,
+      "step": 4400
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.419746258733838e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-4400/training_args.bin b/checkpoint-4400/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-4400/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-4500/README.md b/checkpoint-4500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-4500/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-4500/adapter_config.json b/checkpoint-4500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-4500/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-4500/adapter_model.safetensors b/checkpoint-4500/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e8ebcdcd711a3becbd00e8e12a4df02740e45b85
--- /dev/null
+++ b/checkpoint-4500/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:60a9669a0c97afe6dc5a4a3a2ca2d576054b2401ba19e0ab40447a9c9dacf454
+size 169741912
diff --git a/checkpoint-4500/chat_template.jinja b/checkpoint-4500/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-4500/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders a schema properties mapping as comma-separated key:{...} entries, iterating keys in dictsort order and recursing into array items and object properties. The required argument is accepted but never read inside this macro. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}{#- schema keywords that are skipped as property names when filter_keys is true -#}
+    {%- set ns = namespace(found_first=false) -%}{#- found_first: an entry was already written, so later entries get a leading comma -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- add_comma: a field was already written inside the current entry -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}{#- string properties: only an optional enum list is rendered -#}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}{#- array properties: render the items sub-schema key by key, skipping keys whose value is none -#}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}{#- type may be a single string or a list of strings; both are upper-cased -#}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- no properties mapping: the entry's own keys are rendered as properties, with the schema keywords filtered out via filter_keys=true -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}{#- the upper-cased type is always the last field and is followed by the brace that closes the entry -#}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool definition as declaration:NAME followed by a braced body holding the description, then optional parameters and response sections read from tool_data['function']. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the brace that closes the parameters section is emitted only inside this branch, so a schema without a type leaves it open after a trailing comma; confirm whether that is intended before changing the prompt format -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): likewise the response section is closed only when its type is OBJECT; any other type leaves the brace open -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serializes one value recursively: strings are wrapped in the <|"|> token, booleans become true/false, mappings become a braced key:value list in dictsort order, other sequences a bracketed list, and anything else is printed as is. escape_keys selects whether mapping keys are wrapped in <|"|> as well and is passed down unchanged. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}{#- found_first drives the comma written before every pair except the first -#}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}{#- reached only for values that are neither strings nor mappings, because those branches come first -#}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Removes thinking spans from text: the text is split on the closing <channel|> marker, each piece keeps only what precedes an opening <|channel> marker, and the concatenated remainder is trimmed. -#}
+    {%- set ns = namespace(result='') -%}{#- accumulator for the kept pieces -#}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emits one tool result wrapped in <|tool_response> ... <tool_response|>. A mapping response is written as response:NAME followed by its key:value pairs in dictsort order; any other response is wrapped as a single value: field. tool_name is joined with + to string literals, so it has to be a string. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type records the kind of the last item rendered; it is read at the end of the template to decide whether a generation prompt is emitted -#}
+{%- set loop_messages = messages -%}{#- messages iterated by the main loop; replaced by messages[1:] below when the first message is consumed as the system turn -#}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block: a system turn is opened when thinking is enabled, tools are given, or the first message has role system or developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}{#- content-parts list: each part's text is trimmed and followed by a single space -#}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {%- if tools -%}{#- every tool declaration is wrapped in <|tool> ... <tool|> inside the system turn -#}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard. The index refers to positions in loop_messages and stays -1 when there is no user message; the main loop renders reasoning only for messages positioned after it. -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}{#- later matches overwrite earlier ones, so the final value is the last user index -#}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages. Messages with role tool are not rendered as turns of their own: they are folded into the preceding assistant message that carries tool_calls (see the forward scan below). -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}{#- walk backwards; found makes the remaining iterations no-ops once the nearest non-tool message is seen -#}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel; only for messages after the last user message that also carry tool_calls -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}{#- pre-serialized arguments are emitted verbatim between the braces -#}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag: at least one tool response was rendered for this message -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native). default(..., true) also replaces a name that is present but None or empty, which would otherwise break the string concatenation in format_tool_response_block -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown', true), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name. dict.get yields None (not undefined) for a missing name, so default needs its boolean flag for the 'unknown' fallback to apply -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}{#- boolean default: a missing or None text contributes an empty string instead of failing the concatenation -#}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}{#- the content is rendered into a variable first so that its emptiness can be tested after it is printed -#}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- tool calls without any rendered response: leave the turn open on a <|tool_response> marker -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}{#- the turn is closed unless tool responses were rendered and no content followed them -#}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- Opens a model turn for generation, except when the last rendered item was a tool call or a tool response; in that case nothing is emitted here -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-4500/optimizer.pt b/checkpoint-4500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0fcc6547deb9ea802444bfa07c707a2dbf6f56b7
--- /dev/null
+++ b/checkpoint-4500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:725a5212f8154f26c18f3bf6f21c99c3610460be8e36cd5b4258408fd9ce07b0
+size 72807355
diff --git a/checkpoint-4500/processor_config.json b/checkpoint-4500/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-4500/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-4500/rng_state.pth b/checkpoint-4500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-4500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-4500/scheduler.pt b/checkpoint-4500/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..92670746a2524edac6085d32fcfd8c64889ac055
--- /dev/null
+++ b/checkpoint-4500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:28135fc83e9e89f0b4f8eddf7929fecdeff05a9e15ba8b9f299bb0b694eae52b
+size 1465
diff --git a/checkpoint-4500/tokenizer.json b/checkpoint-4500/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-4500/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-4500/tokenizer_config.json b/checkpoint-4500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-4500/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-4500/trainer_state.json b/checkpoint-4500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..113f5c01f3ef91e41a4e2ca304aa2b0cf6a2357d
--- /dev/null
+++ b/checkpoint-4500/trainer_state.json
@@ -0,0 +1,6342 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.8187772925764192,
+  "eval_steps": 100,
+  "global_step": 4500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6377365356622998,
+      "grad_norm": 0.17014239728450775,
+      "learning_rate": 1.5370045130657366e-05,
+      "loss": 0.14694437980651856,
+      "step": 3505
+    },
+    {
+      "epoch": 0.638646288209607,
+      "grad_norm": 0.14744038879871368,
+      "learning_rate": 1.5302158945041838e-05,
+      "loss": 0.14434736967086792,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6395560407569141,
+      "grad_norm": 0.2069770246744156,
+      "learning_rate": 1.523435683031818e-05,
+      "loss": 0.13982917070388795,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6404657933042213,
+      "grad_norm": 0.17811502516269684,
+      "learning_rate": 1.5166639374265063e-05,
+      "loss": 0.1408839702606201,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6413755458515283,
+      "grad_norm": 0.165786474943161,
+      "learning_rate": 1.509900716392728e-05,
+      "loss": 0.15312877893447877,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6422852983988355,
+      "grad_norm": 0.1633884161710739,
+      "learning_rate": 1.5031460785610596e-05,
+      "loss": 0.1488795518875122,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6431950509461426,
+      "grad_norm": 0.16498984396457672,
+      "learning_rate": 1.4964000824876723e-05,
+      "loss": 0.15031465291976928,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6441048034934498,
+      "grad_norm": 0.18043678998947144,
+      "learning_rate": 1.4896627866538191e-05,
+      "loss": 0.147829806804657,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6450145560407569,
+      "grad_norm": 0.16813597083091736,
+      "learning_rate": 1.4829342494653315e-05,
+      "loss": 0.1418998956680298,
+      "step": 3545
+    },
+    {
+      "epoch": 0.645924308588064,
+      "grad_norm": 0.1817242056131363,
+      "learning_rate": 1.4762145292521118e-05,
+      "loss": 0.14508869647979736,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6468340611353712,
+      "grad_norm": 0.14666494727134705,
+      "learning_rate": 1.469503684267628e-05,
+      "loss": 0.14159854650497436,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6477438136826783,
+      "grad_norm": 0.16485381126403809,
+      "learning_rate": 1.4628017726884086e-05,
+      "loss": 0.14419105052947997,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6486535662299855,
+      "grad_norm": 0.16100342571735382,
+      "learning_rate": 1.4561088526135375e-05,
+      "loss": 0.14501721858978273,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6495633187772926,
+      "grad_norm": 0.16996590793132782,
+      "learning_rate": 1.4494249820641493e-05,
+      "loss": 0.1377166509628296,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6504730713245997,
+      "grad_norm": 0.16168837249279022,
+      "learning_rate": 1.4427502189829339e-05,
+      "loss": 0.1414325475692749,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6513828238719068,
+      "grad_norm": 0.16318906843662262,
+      "learning_rate": 1.436084621233621e-05,
+      "loss": 0.14685193300247193,
+      "step": 3580
+    },
+    {
+      "epoch": 0.652292576419214,
+      "grad_norm": 0.1636219322681427,
+      "learning_rate": 1.4294282466004899e-05,
+      "loss": 0.1405899167060852,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6532023289665211,
+      "grad_norm": 0.1838461309671402,
+      "learning_rate": 1.422781152787865e-05,
+      "loss": 0.14386332035064697,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6541120815138283,
+      "grad_norm": 0.1796344667673111,
+      "learning_rate": 1.4161433974196115e-05,
+      "loss": 0.1513024687767029,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6550218340611353,
+      "grad_norm": 0.16424529254436493,
+      "learning_rate": 1.4095150380386427e-05,
+      "loss": 0.14238927364349366,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6559315866084425,
+      "grad_norm": 0.19264160096645355,
+      "learning_rate": 1.402896132106415e-05,
+      "loss": 0.14297477006912232,
+      "step": 3605
+    },
+    {
+      "epoch": 0.6568413391557496,
+      "grad_norm": 0.18319948017597198,
+      "learning_rate": 1.3962867370024347e-05,
+      "loss": 0.1448880434036255,
+      "step": 3610
+    },
+    {
+      "epoch": 0.6577510917030568,
+      "grad_norm": 0.16507290303707123,
+      "learning_rate": 1.389686910023758e-05,
+      "loss": 0.14724698066711425,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6586608442503639,
+      "grad_norm": 0.17871244251728058,
+      "learning_rate": 1.3830967083844942e-05,
+      "loss": 0.14479386806488037,
+      "step": 3620
+    },
+    {
+      "epoch": 0.659570596797671,
+      "grad_norm": 0.1846228390932083,
+      "learning_rate": 1.3765161892153112e-05,
+      "loss": 0.1453616738319397,
+      "step": 3625
+    },
+    {
+      "epoch": 0.6604803493449781,
+      "grad_norm": 0.17185978591442108,
+      "learning_rate": 1.3699454095629372e-05,
+      "loss": 0.14906206130981445,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6613901018922853,
+      "grad_norm": 0.14751191437244415,
+      "learning_rate": 1.3633844263896698e-05,
+      "loss": 0.13991892337799072,
+      "step": 3635
+    },
+    {
+      "epoch": 0.6622998544395924,
+      "grad_norm": 0.22059763967990875,
+      "learning_rate": 1.3568332965728817e-05,
+      "loss": 0.14680869579315187,
+      "step": 3640
+    },
+    {
+      "epoch": 0.6632096069868996,
+      "grad_norm": 0.15295909345149994,
+      "learning_rate": 1.3502920769045232e-05,
+      "loss": 0.1404443383216858,
+      "step": 3645
+    },
+    {
+      "epoch": 0.6641193595342066,
+      "grad_norm": 0.14600558578968048,
+      "learning_rate": 1.3437608240906364e-05,
+      "loss": 0.14663270711898804,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6650291120815138,
+      "grad_norm": 0.15548352897167206,
+      "learning_rate": 1.3372395947508587e-05,
+      "loss": 0.1431443452835083,
+      "step": 3655
+    },
+    {
+      "epoch": 0.665938864628821,
+      "grad_norm": 0.1813388466835022,
+      "learning_rate": 1.3307284454179342e-05,
+      "loss": 0.1458706736564636,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6668486171761281,
+      "grad_norm": 0.16326870024204254,
+      "learning_rate": 1.3242274325372247e-05,
+      "loss": 0.14700595140457154,
+      "step": 3665
+    },
+    {
+      "epoch": 0.6677583697234353,
+      "grad_norm": 0.18779197335243225,
+      "learning_rate": 1.3177366124662149e-05,
+      "loss": 0.1497237801551819,
+      "step": 3670
+    },
+    {
+      "epoch": 0.6686681222707423,
+      "grad_norm": 0.16291002929210663,
+      "learning_rate": 1.3112560414740315e-05,
+      "loss": 0.1387086868286133,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6695778748180495,
+      "grad_norm": 0.1532297134399414,
+      "learning_rate": 1.3047857757409487e-05,
+      "loss": 0.14497545957565308,
+      "step": 3680
+    },
+    {
+      "epoch": 0.6704876273653566,
+      "grad_norm": 0.14697515964508057,
+      "learning_rate": 1.2983258713579066e-05,
+      "loss": 0.1494283437728882,
+      "step": 3685
+    },
+    {
+      "epoch": 0.6713973799126638,
+      "grad_norm": 0.15213452279567719,
+      "learning_rate": 1.2918763843260218e-05,
+      "loss": 0.1468907594680786,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6723071324599709,
+      "grad_norm": 0.1745215803384781,
+      "learning_rate": 1.285437370556099e-05,
+      "loss": 0.14997754096984864,
+      "step": 3695
+    },
+    {
+      "epoch": 0.673216885007278,
+      "grad_norm": 0.19207637012004852,
+      "learning_rate": 1.2790088858681577e-05,
+      "loss": 0.14202862977981567,
+      "step": 3700
+    },
+    {
+      "epoch": 0.6741266375545851,
+      "grad_norm": 0.1521359086036682,
+      "learning_rate": 1.2725909859909313e-05,
+      "loss": 0.14547673463821412,
+      "step": 3705
+    },
+    {
+      "epoch": 0.6750363901018923,
+      "grad_norm": 0.16975535452365875,
+      "learning_rate": 1.2661837265613999e-05,
+      "loss": 0.14006874561309815,
+      "step": 3710
+    },
+    {
+      "epoch": 0.6759461426491994,
+      "grad_norm": 0.22234582901000977,
+      "learning_rate": 1.2597871631242992e-05,
+      "loss": 0.13691173791885375,
+      "step": 3715
+    },
+    {
+      "epoch": 0.6768558951965066,
+      "grad_norm": 0.16082969307899475,
+      "learning_rate": 1.2534013511316383e-05,
+      "loss": 0.14932308197021485,
+      "step": 3720
+    },
+    {
+      "epoch": 0.6777656477438136,
+      "grad_norm": 0.1751091182231903,
+      "learning_rate": 1.247026345942226e-05,
+      "loss": 0.14531974792480468,
+      "step": 3725
+    },
+    {
+      "epoch": 0.6786754002911208,
+      "grad_norm": 0.15838147699832916,
+      "learning_rate": 1.2406622028211844e-05,
+      "loss": 0.14759832620620728,
+      "step": 3730
+    },
+    {
+      "epoch": 0.6795851528384279,
+      "grad_norm": 0.1771744042634964,
+      "learning_rate": 1.2343089769394714e-05,
+      "loss": 0.1382831573486328,
+      "step": 3735
+    },
+    {
+      "epoch": 0.6804949053857351,
+      "grad_norm": 0.16301538050174713,
+      "learning_rate": 1.2279667233734037e-05,
+      "loss": 0.14444775581359864,
+      "step": 3740
+    },
+    {
+      "epoch": 0.6814046579330422,
+      "grad_norm": 0.1584121286869049,
+      "learning_rate": 1.2216354971041796e-05,
+      "loss": 0.14200170040130616,
+      "step": 3745
+    },
+    {
+      "epoch": 0.6823144104803494,
+      "grad_norm": 0.139187291264534,
+      "learning_rate": 1.2153153530174007e-05,
+      "loss": 0.14318310022354125,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6832241630276564,
+      "grad_norm": 0.13665248453617096,
+      "learning_rate": 1.2090063459025955e-05,
+      "loss": 0.1411946654319763,
+      "step": 3755
+    },
+    {
+      "epoch": 0.6841339155749636,
+      "grad_norm": 0.16273781657218933,
+      "learning_rate": 1.2027085304527475e-05,
+      "loss": 0.14873508214950562,
+      "step": 3760
+    },
+    {
+      "epoch": 0.6850436681222707,
+      "grad_norm": 0.16317526996135712,
+      "learning_rate": 1.1964219612638194e-05,
+      "loss": 0.14644203186035157,
+      "step": 3765
+    },
+    {
+      "epoch": 0.6859534206695779,
+      "grad_norm": 0.17253617942333221,
+      "learning_rate": 1.1901466928342777e-05,
+      "loss": 0.14027841091156007,
+      "step": 3770
+    },
+    {
+      "epoch": 0.6868631732168851,
+      "grad_norm": 0.19692830741405487,
+      "learning_rate": 1.183882779564624e-05,
+      "loss": 0.14411110877990724,
+      "step": 3775
+    },
+    {
+      "epoch": 0.6877729257641921,
+      "grad_norm": 0.15444578230381012,
+      "learning_rate": 1.1776302757569214e-05,
+      "loss": 0.14355008602142333,
+      "step": 3780
+    },
+    {
+      "epoch": 0.6886826783114993,
+      "grad_norm": 0.1622200757265091,
+      "learning_rate": 1.1713892356143239e-05,
+      "loss": 0.14794334173202514,
+      "step": 3785
+    },
+    {
+      "epoch": 0.6895924308588064,
+      "grad_norm": 0.1898501068353653,
+      "learning_rate": 1.1651597132406073e-05,
+      "loss": 0.1418622612953186,
+      "step": 3790
+    },
+    {
+      "epoch": 0.6905021834061136,
+      "grad_norm": 0.17803208529949188,
+      "learning_rate": 1.1589417626396973e-05,
+      "loss": 0.14576040506362914,
+      "step": 3795
+    },
+    {
+      "epoch": 0.6914119359534207,
+      "grad_norm": 0.17138013243675232,
+      "learning_rate": 1.1527354377152053e-05,
+      "loss": 0.14494270086288452,
+      "step": 3800
+    },
+    {
+      "epoch": 0.6923216885007278,
+      "grad_norm": 0.15170913934707642,
+      "learning_rate": 1.1465407922699603e-05,
+      "loss": 0.144084370136261,
+      "step": 3805
+    },
+    {
+      "epoch": 0.6932314410480349,
+      "grad_norm": 0.158562570810318,
+      "learning_rate": 1.1403578800055387e-05,
+      "loss": 0.13636608123779298,
+      "step": 3810
+    },
+    {
+      "epoch": 0.6941411935953421,
+      "grad_norm": 0.17687302827835083,
+      "learning_rate": 1.1341867545218044e-05,
+      "loss": 0.14214688539505005,
+      "step": 3815
+    },
+    {
+      "epoch": 0.6950509461426492,
+      "grad_norm": 0.15394899249076843,
+      "learning_rate": 1.1280274693164378e-05,
+      "loss": 0.14914129972457885,
+      "step": 3820
+    },
+    {
+      "epoch": 0.6959606986899564,
+      "grad_norm": 0.15709355473518372,
+      "learning_rate": 1.12188007778448e-05,
+      "loss": 0.14798580408096312,
+      "step": 3825
+    },
+    {
+      "epoch": 0.6968704512372634,
+      "grad_norm": 0.16631539165973663,
+      "learning_rate": 1.115744633217864e-05,
+      "loss": 0.14756966829299928,
+      "step": 3830
+    },
+    {
+      "epoch": 0.6977802037845706,
+      "grad_norm": 0.15893076360225677,
+      "learning_rate": 1.109621188804951e-05,
+      "loss": 0.14061959981918334,
+      "step": 3835
+    },
+    {
+      "epoch": 0.6986899563318777,
+      "grad_norm": 0.183414489030838,
+      "learning_rate": 1.103509797630077e-05,
+      "loss": 0.1448473334312439,
+      "step": 3840
+    },
+    {
+      "epoch": 0.6995997088791849,
+      "grad_norm": 0.14087305963039398,
+      "learning_rate": 1.0974105126730841e-05,
+      "loss": 0.14369285106658936,
+      "step": 3845
+    },
+    {
+      "epoch": 0.700509461426492,
+      "grad_norm": 0.16919967532157898,
+      "learning_rate": 1.0913233868088685e-05,
+      "loss": 0.1478085398674011,
+      "step": 3850
+    },
+    {
+      "epoch": 0.7014192139737991,
+      "grad_norm": 0.1439533829689026,
+      "learning_rate": 1.0852484728069178e-05,
+      "loss": 0.14376721382141114,
+      "step": 3855
+    },
+    {
+      "epoch": 0.7023289665211062,
+      "grad_norm": 0.17719274759292603,
+      "learning_rate": 1.0791858233308521e-05,
+      "loss": 0.14089040756225585,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7032387190684134,
+      "grad_norm": 0.19753769040107727,
+      "learning_rate": 1.0731354909379754e-05,
+      "loss": 0.15021742582321168,
+      "step": 3865
+    },
+    {
+      "epoch": 0.7041484716157205,
+      "grad_norm": 0.19186992943286896,
+      "learning_rate": 1.0670975280788086e-05,
+      "loss": 0.14113202095031738,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7050582241630277,
+      "grad_norm": 0.1709229201078415,
+      "learning_rate": 1.0610719870966443e-05,
+      "loss": 0.1500566840171814,
+      "step": 3875
+    },
+    {
+      "epoch": 0.7059679767103348,
+      "grad_norm": 0.17846204340457916,
+      "learning_rate": 1.0550589202270892e-05,
+      "loss": 0.15014195442199707,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7068777292576419,
+      "grad_norm": 0.1827082335948944,
+      "learning_rate": 1.0490583795976091e-05,
+      "loss": 0.1423472762107849,
+      "step": 3885
+    },
+    {
+      "epoch": 0.7077874818049491,
+      "grad_norm": 0.17418377101421356,
+      "learning_rate": 1.043070417227083e-05,
+      "loss": 0.14668900966644288,
+      "step": 3890
+    },
+    {
+      "epoch": 0.7086972343522562,
+      "grad_norm": 0.17385616898536682,
+      "learning_rate": 1.0370950850253449e-05,
+      "loss": 0.14627279043197633,
+      "step": 3895
+    },
+    {
+      "epoch": 0.7096069868995634,
+      "grad_norm": 0.16486723721027374,
+      "learning_rate": 1.0311324347927404e-05,
+      "loss": 0.14603652954101562,
+      "step": 3900
+    },
+    {
+      "epoch": 0.7105167394468704,
+      "grad_norm": 0.21806862950325012,
+      "learning_rate": 1.0251825182196732e-05,
+      "loss": 0.1488169550895691,
+      "step": 3905
+    },
+    {
+      "epoch": 0.7114264919941776,
+      "grad_norm": 0.19884569942951202,
+      "learning_rate": 1.019245386886159e-05,
+      "loss": 0.14387656450271608,
+      "step": 3910
+    },
+    {
+      "epoch": 0.7123362445414847,
+      "grad_norm": 0.16139011085033417,
+      "learning_rate": 1.0133210922613789e-05,
+      "loss": 0.1483074426651001,
+      "step": 3915
+    },
+    {
+      "epoch": 0.7132459970887919,
+      "grad_norm": 0.17000740766525269,
+      "learning_rate": 1.007409685703229e-05,
+      "loss": 0.14050065279006957,
+      "step": 3920
+    },
+    {
+      "epoch": 0.714155749636099,
+      "grad_norm": 0.17235304415225983,
+      "learning_rate": 1.0015112184578813e-05,
+      "loss": 0.1440442681312561,
+      "step": 3925
+    },
+    {
+      "epoch": 0.7150655021834061,
+      "grad_norm": 0.15737567842006683,
+      "learning_rate": 9.956257416593362e-06,
+      "loss": 0.14960765838623047,
+      "step": 3930
+    },
+    {
+      "epoch": 0.7159752547307132,
+      "grad_norm": 0.15499180555343628,
+      "learning_rate": 9.897533063289773e-06,
+      "loss": 0.14488829374313356,
+      "step": 3935
+    },
+    {
+      "epoch": 0.7168850072780204,
+      "grad_norm": 0.17744216322898865,
+      "learning_rate": 9.838939633751337e-06,
+      "loss": 0.1416949987411499,
+      "step": 3940
+    },
+    {
+      "epoch": 0.7177947598253275,
+      "grad_norm": 0.1597192883491516,
+      "learning_rate": 9.780477635926358e-06,
+      "loss": 0.14275280237197877,
+      "step": 3945
+    },
+    {
+      "epoch": 0.7187045123726347,
+      "grad_norm": 0.17800374329090118,
+      "learning_rate": 9.722147576623743e-06,
+      "loss": 0.14532098770141602,
+      "step": 3950
+    },
+    {
+      "epoch": 0.7196142649199417,
+      "grad_norm": 0.1828162521123886,
+      "learning_rate": 9.66394996150864e-06,
+      "loss": 0.14525585174560546,
+      "step": 3955
+    },
+    {
+      "epoch": 0.7205240174672489,
+      "grad_norm": 0.1800539344549179,
+      "learning_rate": 9.605885295098005e-06,
+      "loss": 0.14235819578170777,
+      "step": 3960
+    },
+    {
+      "epoch": 0.721433770014556,
+      "grad_norm": 0.16556483507156372,
+      "learning_rate": 9.54795408075628e-06,
+      "loss": 0.13965482711791993,
+      "step": 3965
+    },
+    {
+      "epoch": 0.7223435225618632,
+      "grad_norm": 0.1592024862766266,
+      "learning_rate": 9.49015682069101e-06,
+      "loss": 0.14051042795181273,
+      "step": 3970
+    },
+    {
+      "epoch": 0.7232532751091703,
+      "grad_norm": 0.18988847732543945,
+      "learning_rate": 9.43249401594846e-06,
+      "loss": 0.1436900496482849,
+      "step": 3975
+    },
+    {
+      "epoch": 0.7241630276564774,
+      "grad_norm": 0.24433808028697968,
+      "learning_rate": 9.374966166409329e-06,
+      "loss": 0.14883997440338134,
+      "step": 3980
+    },
+    {
+      "epoch": 0.7250727802037845,
+      "grad_norm": 0.15091639757156372,
+      "learning_rate": 9.317573770784352e-06,
+      "loss": 0.14726560115814208,
+      "step": 3985
+    },
+    {
+      "epoch": 0.7259825327510917,
+      "grad_norm": 0.17045573890209198,
+      "learning_rate": 9.260317326610051e-06,
+      "loss": 0.14120506048202514,
+      "step": 3990
+    },
+    {
+      "epoch": 0.7268922852983989,
+      "grad_norm": 0.18847957253456116,
+      "learning_rate": 9.203197330244343e-06,
+      "loss": 0.1377041220664978,
+      "step": 3995
+    },
+    {
+      "epoch": 0.727802037845706,
+      "grad_norm": 0.1516445279121399,
+      "learning_rate": 9.14621427686229e-06,
+      "loss": 0.14043946266174318,
+      "step": 4000
+    },
+    {
+      "epoch": 0.7287117903930131,
+      "grad_norm": 0.18264050781726837,
+      "learning_rate": 9.0893686604518e-06,
+      "loss": 0.14080368280410765,
+      "step": 4005
+    },
+    {
+      "epoch": 0.7296215429403202,
+      "grad_norm": 0.19129371643066406,
+      "learning_rate": 9.032660973809312e-06,
+      "loss": 0.1402561902999878,
+      "step": 4010
+    },
+    {
+      "epoch": 0.7305312954876274,
+      "grad_norm": 0.15762710571289062,
+      "learning_rate": 8.976091708535567e-06,
+      "loss": 0.14421157836914061,
+      "step": 4015
+    },
+    {
+      "epoch": 0.7314410480349345,
+      "grad_norm": 0.17785198986530304,
+      "learning_rate": 8.919661355031331e-06,
+      "loss": 0.14999009370803834,
+      "step": 4020
+    },
+    {
+      "epoch": 0.7323508005822417,
+      "grad_norm": 0.15306031703948975,
+      "learning_rate": 8.8633704024931e-06,
+      "loss": 0.14101698398590087,
+      "step": 4025
+    },
+    {
+      "epoch": 0.7332605531295487,
+      "grad_norm": 0.16481758654117584,
+      "learning_rate": 8.807219338908968e-06,
+      "loss": 0.14170764684677123,
+      "step": 4030
+    },
+    {
+      "epoch": 0.7341703056768559,
+      "grad_norm": 0.14892235398292542,
+      "learning_rate": 8.751208651054257e-06,
+      "loss": 0.15317896604537964,
+      "step": 4035
+    },
+    {
+      "epoch": 0.735080058224163,
+      "grad_norm": 0.1775592565536499,
+      "learning_rate": 8.695338824487409e-06,
+      "loss": 0.1520617723464966,
+      "step": 4040
+    },
+    {
+      "epoch": 0.7359898107714702,
+      "grad_norm": 0.1614258885383606,
+      "learning_rate": 8.639610343545728e-06,
+      "loss": 0.13747400045394897,
+      "step": 4045
+    },
+    {
+      "epoch": 0.7368995633187773,
+      "grad_norm": 0.21415506303310394,
+      "learning_rate": 8.58402369134117e-06,
+      "loss": 0.1432439088821411,
+      "step": 4050
+    },
+    {
+      "epoch": 0.7378093158660844,
+      "grad_norm": 0.1759418249130249,
+      "learning_rate": 8.528579349756205e-06,
+      "loss": 0.141641104221344,
+      "step": 4055
+    },
+    {
+      "epoch": 0.7387190684133915,
+      "grad_norm": 0.16738329827785492,
+      "learning_rate": 8.47327779943957e-06,
+      "loss": 0.14294810295104982,
+      "step": 4060
+    },
+    {
+      "epoch": 0.7396288209606987,
+      "grad_norm": 0.13916844129562378,
+      "learning_rate": 8.41811951980217e-06,
+      "loss": 0.13876968622207642,
+      "step": 4065
+    },
+    {
+      "epoch": 0.7405385735080058,
+      "grad_norm": 0.1828441321849823,
+      "learning_rate": 8.36310498901288e-06,
+      "loss": 0.148428475856781,
+      "step": 4070
+    },
+    {
+      "epoch": 0.741448326055313,
+      "grad_norm": 0.16534076631069183,
+      "learning_rate": 8.308234683994415e-06,
+      "loss": 0.14222711324691772,
+      "step": 4075
+    },
+    {
+      "epoch": 0.74235807860262,
+      "grad_norm": 0.17922644317150116,
+      "learning_rate": 8.253509080419198e-06,
+      "loss": 0.14365782737731933,
+      "step": 4080
+    },
+    {
+      "epoch": 0.7432678311499272,
+      "grad_norm": 0.15061035752296448,
+      "learning_rate": 8.198928652705204e-06,
+      "loss": 0.13571925163269044,
+      "step": 4085
+    },
+    {
+      "epoch": 0.7441775836972343,
+      "grad_norm": 0.18075402081012726,
+      "learning_rate": 8.144493874011908e-06,
+      "loss": 0.14385528564453126,
+      "step": 4090
+    },
+    {
+      "epoch": 0.7450873362445415,
+      "grad_norm": 0.16514739394187927,
+      "learning_rate": 8.090205216236135e-06,
+      "loss": 0.14920626878738402,
+      "step": 4095
+    },
+    {
+      "epoch": 0.7459970887918487,
+      "grad_norm": 0.16453702747821808,
+      "learning_rate": 8.03606315000797e-06,
+      "loss": 0.14704222679138185,
+      "step": 4100
+    },
+    {
+      "epoch": 0.7469068413391557,
+      "grad_norm": 0.16719917953014374,
+      "learning_rate": 7.982068144686707e-06,
+      "loss": 0.14722511768341065,
+      "step": 4105
+    },
+    {
+      "epoch": 0.7478165938864629,
+      "grad_norm": 0.18499110639095306,
+      "learning_rate": 7.92822066835677e-06,
+      "loss": 0.1401848554611206,
+      "step": 4110
+    },
+    {
+      "epoch": 0.74872634643377,
+      "grad_norm": 0.17249563336372375,
+      "learning_rate": 7.87452118782363e-06,
+      "loss": 0.15132423639297485,
+      "step": 4115
+    },
+    {
+      "epoch": 0.7496360989810772,
+      "grad_norm": 0.15049682557582855,
+      "learning_rate": 7.8209701686098e-06,
+      "loss": 0.1341150164604187,
+      "step": 4120
+    },
+    {
+      "epoch": 0.7505458515283843,
+      "grad_norm": 0.16892646253108978,
+      "learning_rate": 7.767568074950751e-06,
+      "loss": 0.1466840147972107,
+      "step": 4125
+    },
+    {
+      "epoch": 0.7514556040756915,
+      "grad_norm": 0.17288286983966827,
+      "learning_rate": 7.714315369790942e-06,
+      "loss": 0.13819680213928223,
+      "step": 4130
+    },
+    {
+      "epoch": 0.7523653566229985,
+      "grad_norm": 0.21893996000289917,
+      "learning_rate": 7.661212514779745e-06,
+      "loss": 0.14369510412216185,
+      "step": 4135
+    },
+    {
+      "epoch": 0.7532751091703057,
+      "grad_norm": 0.1674601435661316,
+      "learning_rate": 7.608259970267509e-06,
+      "loss": 0.14810250997543334,
+      "step": 4140
+    },
+    {
+      "epoch": 0.7541848617176128,
+      "grad_norm": 0.15875539183616638,
+      "learning_rate": 7.555458195301526e-06,
+      "loss": 0.14103198051452637,
+      "step": 4145
+    },
+    {
+      "epoch": 0.75509461426492,
+      "grad_norm": 0.19454079866409302,
+      "learning_rate": 7.502807647622037e-06,
+      "loss": 0.13848764896392823,
+      "step": 4150
+    },
+    {
+      "epoch": 0.756004366812227,
+      "grad_norm": 0.1795455813407898,
+      "learning_rate": 7.450308783658341e-06,
+      "loss": 0.14459335803985596,
+      "step": 4155
+    },
+    {
+      "epoch": 0.7569141193595342,
+      "grad_norm": 0.1643362045288086,
+      "learning_rate": 7.397962058524735e-06,
+      "loss": 0.14335378408432006,
+      "step": 4160
+    },
+    {
+      "epoch": 0.7578238719068413,
+      "grad_norm": 0.16362066566944122,
+      "learning_rate": 7.3457679260166475e-06,
+      "loss": 0.14222005605697632,
+      "step": 4165
+    },
+    {
+      "epoch": 0.7587336244541485,
+      "grad_norm": 0.17313003540039062,
+      "learning_rate": 7.293726838606674e-06,
+      "loss": 0.14272255897521974,
+      "step": 4170
+    },
+    {
+      "epoch": 0.7596433770014556,
+      "grad_norm": 0.1809929460287094,
+      "learning_rate": 7.2418392474406405e-06,
+      "loss": 0.14089123010635377,
+      "step": 4175
+    },
+    {
+      "epoch": 0.7605531295487628,
+      "grad_norm": 0.14306005835533142,
+      "learning_rate": 7.19010560233373e-06,
+      "loss": 0.13531534671783446,
+      "step": 4180
+    },
+    {
+      "epoch": 0.7614628820960698,
+      "grad_norm": 0.15525390207767487,
+      "learning_rate": 7.138526351766559e-06,
+      "loss": 0.14340845346450806,
+      "step": 4185
+    },
+    {
+      "epoch": 0.762372634643377,
+      "grad_norm": 0.24478943645954132,
+      "learning_rate": 7.087101942881263e-06,
+      "loss": 0.14744555950164795,
+      "step": 4190
+    },
+    {
+      "epoch": 0.7632823871906841,
+      "grad_norm": 0.31335577368736267,
+      "learning_rate": 7.035832821477711e-06,
+      "loss": 0.1484094500541687,
+      "step": 4195
+    },
+    {
+      "epoch": 0.7641921397379913,
+      "grad_norm": 0.15140366554260254,
+      "learning_rate": 6.984719432009515e-06,
+      "loss": 0.14991614818572999,
+      "step": 4200
+    },
+    {
+      "epoch": 0.7651018922852983,
+      "grad_norm": 0.16125506162643433,
+      "learning_rate": 6.933762217580289e-06,
+      "loss": 0.1408134937286377,
+      "step": 4205
+    },
+    {
+      "epoch": 0.7660116448326055,
+      "grad_norm": 0.2501450181007385,
+      "learning_rate": 6.882961619939726e-06,
+      "loss": 0.13875640630722047,
+      "step": 4210
+    },
+    {
+      "epoch": 0.7669213973799127,
+      "grad_norm": 0.16227811574935913,
+      "learning_rate": 6.8323180794798245e-06,
+      "loss": 0.14138660430908204,
+      "step": 4215
+    },
+    {
+      "epoch": 0.7678311499272198,
+      "grad_norm": 0.16676810383796692,
+      "learning_rate": 6.781832035231053e-06,
+      "loss": 0.14696706533432008,
+      "step": 4220
+    },
+    {
+      "epoch": 0.768740902474527,
+      "grad_norm": 0.14638574421405792,
+      "learning_rate": 6.731503924858518e-06,
+      "loss": 0.14263020753860473,
+      "step": 4225
+    },
+    {
+      "epoch": 0.769650655021834,
+      "grad_norm": 0.17093190550804138,
+      "learning_rate": 6.681334184658211e-06,
+      "loss": 0.14694111347198485,
+      "step": 4230
+    },
+    {
+      "epoch": 0.7705604075691412,
+      "grad_norm": 0.17174287140369415,
+      "learning_rate": 6.631323249553201e-06,
+      "loss": 0.13854929208755493,
+      "step": 4235
+    },
+    {
+      "epoch": 0.7714701601164483,
+      "grad_norm": 0.14599016308784485,
+      "learning_rate": 6.5814715530898745e-06,
+      "loss": 0.14058833122253417,
+      "step": 4240
+    },
+    {
+      "epoch": 0.7723799126637555,
+      "grad_norm": 0.16222265362739563,
+      "learning_rate": 6.531779527434176e-06,
+      "loss": 0.1428326725959778,
+      "step": 4245
+    },
+    {
+      "epoch": 0.7732896652110626,
+      "grad_norm": 0.1741994023323059,
+      "learning_rate": 6.482247603367839e-06,
+      "loss": 0.13985042572021483,
+      "step": 4250
+    },
+    {
+      "epoch": 0.7741994177583698,
+      "grad_norm": 0.17427101731300354,
+      "learning_rate": 6.432876210284688e-06,
+      "loss": 0.1442667603492737,
+      "step": 4255
+    },
+    {
+      "epoch": 0.7751091703056768,
+      "grad_norm": 0.1665259599685669,
+      "learning_rate": 6.383665776186912e-06,
+      "loss": 0.1421986222267151,
+      "step": 4260
+    },
+    {
+      "epoch": 0.776018922852984,
+      "grad_norm": 0.1728232353925705,
+      "learning_rate": 6.334616727681303e-06,
+      "loss": 0.1367053508758545,
+      "step": 4265
+    },
+    {
+      "epoch": 0.7769286754002911,
+      "grad_norm": 0.15882381796836853,
+      "learning_rate": 6.285729489975639e-06,
+      "loss": 0.14551182985305786,
+      "step": 4270
+    },
+    {
+      "epoch": 0.7778384279475983,
+      "grad_norm": 0.242042675614357,
+      "learning_rate": 6.2370044868749115e-06,
+      "loss": 0.1455132007598877,
+      "step": 4275
+    },
+    {
+      "epoch": 0.7787481804949054,
+      "grad_norm": 0.1599501073360443,
+      "learning_rate": 6.188442140777742e-06,
+      "loss": 0.1424942970275879,
+      "step": 4280
+    },
+    {
+      "epoch": 0.7796579330422125,
+      "grad_norm": 0.15182635188102722,
+      "learning_rate": 6.140042872672647e-06,
+      "loss": 0.14212887287139891,
+      "step": 4285
+    },
+    {
+      "epoch": 0.7805676855895196,
+      "grad_norm": 0.1720375418663025,
+      "learning_rate": 6.091807102134403e-06,
+      "loss": 0.14243412017822266,
+      "step": 4290
+    },
+    {
+      "epoch": 0.7814774381368268,
+      "grad_norm": 0.16436047852039337,
+      "learning_rate": 6.043735247320454e-06,
+      "loss": 0.15035657882690429,
+      "step": 4295
+    },
+    {
+      "epoch": 0.7823871906841339,
+      "grad_norm": 0.1498408019542694,
+      "learning_rate": 5.995827724967218e-06,
+      "loss": 0.14494839906692505,
+      "step": 4300
+    },
+    {
+      "epoch": 0.7832969432314411,
+      "grad_norm": 0.16924560070037842,
+      "learning_rate": 5.948084950386535e-06,
+      "loss": 0.13581212759017944,
+      "step": 4305
+    },
+    {
+      "epoch": 0.7842066957787481,
+      "grad_norm": 0.15889139473438263,
+      "learning_rate": 5.900507337462036e-06,
+      "loss": 0.15071530342102052,
+      "step": 4310
+    },
+    {
+      "epoch": 0.7851164483260553,
+      "grad_norm": 0.17201054096221924,
+      "learning_rate": 5.853095298645542e-06,
+      "loss": 0.1398628830909729,
+      "step": 4315
+    },
+    {
+      "epoch": 0.7860262008733624,
+      "grad_norm": 0.17965619266033173,
+      "learning_rate": 5.805849244953548e-06,
+      "loss": 0.14666696786880493,
+      "step": 4320
+    },
+    {
+      "epoch": 0.7869359534206696,
+      "grad_norm": 0.17514032125473022,
+      "learning_rate": 5.758769585963569e-06,
+      "loss": 0.1383386731147766,
+      "step": 4325
+    },
+    {
+      "epoch": 0.7878457059679768,
+      "grad_norm": 0.17497631907463074,
+      "learning_rate": 5.7118567298106744e-06,
+      "loss": 0.14362354278564454,
+      "step": 4330
+    },
+    {
+      "epoch": 0.7887554585152838,
+      "grad_norm": 0.16770458221435547,
+      "learning_rate": 5.665111083183905e-06,
+      "loss": 0.14136618375778198,
+      "step": 4335
+    },
+    {
+      "epoch": 0.789665211062591,
+      "grad_norm": 0.17134106159210205,
+      "learning_rate": 5.618533051322747e-06,
+      "loss": 0.1401529550552368,
+      "step": 4340
+    },
+    {
+      "epoch": 0.7905749636098981,
+      "grad_norm": 0.19458788633346558,
+      "learning_rate": 5.5721230380136435e-06,
+      "loss": 0.1393273115158081,
+      "step": 4345
+    },
+    {
+      "epoch": 0.7914847161572053,
+      "grad_norm": 0.19483692944049835,
+      "learning_rate": 5.525881445586467e-06,
+      "loss": 0.1369825482368469,
+      "step": 4350
+    },
+    {
+      "epoch": 0.7923944687045124,
+      "grad_norm": 0.3052191734313965,
+      "learning_rate": 5.4798086749110495e-06,
+      "loss": 0.14762181043624878,
+      "step": 4355
+    },
+    {
+      "epoch": 0.7933042212518195,
+      "grad_norm": 0.164458766579628,
+      "learning_rate": 5.4339051253937065e-06,
+      "loss": 0.14501686096191407,
+      "step": 4360
+    },
+    {
+      "epoch": 0.7942139737991266,
+      "grad_norm": 0.1719193458557129,
+      "learning_rate": 5.3881711949737625e-06,
+      "loss": 0.13321092128753662,
+      "step": 4365
+    },
+    {
+      "epoch": 0.7951237263464338,
+      "grad_norm": 0.17219696938991547,
+      "learning_rate": 5.342607280120121e-06,
+      "loss": 0.1413906455039978,
+      "step": 4370
+    },
+    {
+      "epoch": 0.7960334788937409,
+      "grad_norm": 0.15083056688308716,
+      "learning_rate": 5.297213775827789e-06,
+      "loss": 0.14772192239761353,
+      "step": 4375
+    },
+    {
+      "epoch": 0.7969432314410481,
+      "grad_norm": 0.1699071079492569,
+      "learning_rate": 5.251991075614507e-06,
+      "loss": 0.1392375946044922,
+      "step": 4380
+    },
+    {
+      "epoch": 0.7978529839883551,
+      "grad_norm": 0.1680395007133484,
+      "learning_rate": 5.206939571517302e-06,
+      "loss": 0.14185575246810914,
+      "step": 4385
+    },
+    {
+      "epoch": 0.7987627365356623,
+      "grad_norm": 0.16526710987091064,
+      "learning_rate": 5.162059654089083e-06,
+      "loss": 0.15001428127288818,
+      "step": 4390
+    },
+    {
+      "epoch": 0.7996724890829694,
+      "grad_norm": 0.16281752288341522,
+      "learning_rate": 5.1173517123952794e-06,
+      "loss": 0.13747023344039916,
+      "step": 4395
+    },
+    {
+      "epoch": 0.8005822416302766,
+      "grad_norm": 0.1454378366470337,
+      "learning_rate": 5.072816134010458e-06,
+      "loss": 0.14710829257965088,
+      "step": 4400
+    },
+    {
+      "epoch": 0.8014919941775837,
+      "grad_norm": 0.16565890610218048,
+      "learning_rate": 5.028453305014966e-06,
+      "loss": 0.14138611555099487,
+      "step": 4405
+    },
+    {
+      "epoch": 0.8024017467248908,
+      "grad_norm": 0.1962810605764389,
+      "learning_rate": 4.984263609991577e-06,
+      "loss": 0.13836177587509155,
+      "step": 4410
+    },
+    {
+      "epoch": 0.8033114992721979,
+      "grad_norm": 0.16091369092464447,
+      "learning_rate": 4.940247432022149e-06,
+      "loss": 0.14407440423965454,
+      "step": 4415
+    },
+    {
+      "epoch": 0.8042212518195051,
+      "grad_norm": 0.1930241584777832,
+      "learning_rate": 4.89640515268433e-06,
+      "loss": 0.14346336126327514,
+      "step": 4420
+    },
+    {
+      "epoch": 0.8051310043668122,
+      "grad_norm": 0.19301500916481018,
+      "learning_rate": 4.852737152048242e-06,
+      "loss": 0.14174317121505736,
+      "step": 4425
+    },
+    {
+      "epoch": 0.8060407569141194,
+      "grad_norm": 0.1541353315114975,
+      "learning_rate": 4.80924380867315e-06,
+      "loss": 0.14100592136383056,
+      "step": 4430
+    },
+    {
+      "epoch": 0.8069505094614265,
+      "grad_norm": 0.16285750269889832,
+      "learning_rate": 4.765925499604243e-06,
+      "loss": 0.1441288709640503,
+      "step": 4435
+    },
+    {
+      "epoch": 0.8078602620087336,
+      "grad_norm": 0.17382675409317017,
+      "learning_rate": 4.722782600369299e-06,
+      "loss": 0.13763951063156127,
+      "step": 4440
+    },
+    {
+      "epoch": 0.8087700145560408,
+      "grad_norm": 0.1697344034910202,
+      "learning_rate": 4.679815484975505e-06,
+      "loss": 0.1410105347633362,
+      "step": 4445
+    },
+    {
+      "epoch": 0.8096797671033479,
+      "grad_norm": 0.19964542984962463,
+      "learning_rate": 4.637024525906131e-06,
+      "loss": 0.1439276695251465,
+      "step": 4450
+    },
+    {
+      "epoch": 0.8105895196506551,
+      "grad_norm": 0.165307879447937,
+      "learning_rate": 4.59441009411736e-06,
+      "loss": 0.13897504806518554,
+      "step": 4455
+    },
+    {
+      "epoch": 0.8114992721979621,
+      "grad_norm": 0.16687989234924316,
+      "learning_rate": 4.551972559035067e-06,
+      "loss": 0.1422593355178833,
+      "step": 4460
+    },
+    {
+      "epoch": 0.8124090247452693,
+      "grad_norm": 0.15737789869308472,
+      "learning_rate": 4.509712288551571e-06,
+      "loss": 0.1452128052711487,
+      "step": 4465
+    },
+    {
+      "epoch": 0.8133187772925764,
+      "grad_norm": 0.17116659879684448,
+      "learning_rate": 4.467629649022509e-06,
+      "loss": 0.14385371208190917,
+      "step": 4470
+    },
+    {
+      "epoch": 0.8142285298398836,
+      "grad_norm": 0.17457640171051025,
+      "learning_rate": 4.425725005263623e-06,
+      "loss": 0.14808475971221924,
+      "step": 4475
+    },
+    {
+      "epoch": 0.8151382823871907,
+      "grad_norm": 0.1621970385313034,
+      "learning_rate": 4.383998720547583e-06,
+      "loss": 0.13927959203720092,
+      "step": 4480
+    },
+    {
+      "epoch": 0.8160480349344978,
+      "grad_norm": 0.176296666264534,
+      "learning_rate": 4.342451156600896e-06,
+      "loss": 0.15041060447692872,
+      "step": 4485
+    },
+    {
+      "epoch": 0.8169577874818049,
+      "grad_norm": 0.17157645523548126,
+      "learning_rate": 4.301082673600698e-06,
+      "loss": 0.13932652473449708,
+      "step": 4490
+    },
+    {
+      "epoch": 0.8178675400291121,
+      "grad_norm": 0.15378527343273163,
+      "learning_rate": 4.259893630171682e-06,
+      "loss": 0.1406856894493103,
+      "step": 4495
+    },
+    {
+      "epoch": 0.8187772925764192,
+      "grad_norm": 0.1750226765871048,
+      "learning_rate": 4.218884383382987e-06,
+      "loss": 0.1350164532661438,
+      "step": 4500
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.473992405047116e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-4500/training_args.bin b/checkpoint-4500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-4500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-4600/README.md b/checkpoint-4600/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-4600/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-4600/adapter_config.json b/checkpoint-4600/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-4600/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-4600/adapter_model.safetensors b/checkpoint-4600/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6d8ccfdd04a942ec9e97ccdac39d60a1fa198415
--- /dev/null
+++ b/checkpoint-4600/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f99b43fa7dd20c233bf68eca4c2431df06f09ab757a998e33fdecd1901d66069
+size 169741912
diff --git a/checkpoint-4600/chat_template.jinja b/checkpoint-4600/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-4600/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders each (key, schema) pair of properties as a comma-separated key:{...} entry, iterating in dictsort order. The required argument is accepted but never read in this body. With filter_keys=true, keys listed in standard_keys are skipped. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- add_comma records whether a field was already written inside the current key:{...}, so that later fields are preceded by a comma -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}{#- items sub-schema: entries whose value is none are skipped; properties, required and type get dedicated rendering, any other key goes through format_argument -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- no usable 'properties' entry: recurse over the schema itself with filter_keys=true so only its non-standard keys are rendered as properties -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- the upper-cased type is always written last, followed by the brace that closes this key:{...} entry -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool definition from tool_data['function'] as declaration:NAME{description:...} with optional parameters:{...} and response:{...} sections. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the brace that closes parameters:{ is written only inside this branch, so a schema without 'type' leaves it open; confirm every tool schema carries a 'type' -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): likewise, response:{ is closed only when the response type is OBJECT; confirm other types are never declared -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Recursively serialises a value: strings are wrapped in the <|"|> escape token, booleans become true/false, mappings become {k:v,...} in dictsort order, other sequences become [a,b,...], and anything else is printed as-is. escape_keys controls whether mapping keys are also wrapped in <|"|>. -#}
+    {%- if argument is string -%}{#- the string test runs before the sequence test below -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}{#- fallback for every remaining type: the value is emitted through plain interpolation -#}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Returns text with every span from '<|channel>' up to the next '<channel|>' removed, then trimmed. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}{#- each part ends where a closing marker stood (or at end of text); the marker itself is dropped by the split -#}
+        {%- if '<|channel>' in part -%}{#- keep only what precedes the opening marker; an opener with no closer therefore discards the rest of the text -#}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emits one '<|tool_response>response:NAME{...}<tool_response|>' block. tool_name is concatenated with string literals, so it must be a string. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}{#- mapping: one key:value pair per entry in dictsort order, keys unescaped -#}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}{#- any non-mapping response is wrapped as a single 'value' field -#}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type records the kind of the last element rendered; it is read when closing a turn and when deciding whether to emit the generation prompt -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- System/tool-definitions block: emitted when thinking is enabled, tools are supplied, or the first message is a system/developer message. NOTE(review): messages[0] is indexed without a length check; confirm callers never pass an empty messages list. -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject the thinking token at the very top of the system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}{#- the leading system/developer message is rendered here and then dropped from loop_messages -#}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}{#- content parts: each part's trimmed text is emitted followed by a single space -#}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: record the index (within loop_messages) of the last user message; it stays -1 when there is none. The message loop uses it so reasoning is rendered only for messages after that index. -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages. role:tool messages are skipped here; they are rendered from the assistant message that issued the preceding tool_calls (see the forward scan below). -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as a thinking channel, only for tool-calling messages that come after the last user message -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan the role:tool messages that directly follow, stopping at the first non-tool message -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name. default(..., true) is required: .get() yields None (not undefined) for a missing 'name', and None would break the string concatenation inside format_tool_response_block -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array; only 'text' parts are kept, and a missing or None 'text' contributes an empty string -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}{#- Turn closing follows: open '<|tool_response>' when tool calls have no rendered responses; otherwise emit '<turn|>' unless responses were rendered and no content came after them -#}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- Open a new model turn for generation, except when the last rendered element was a tool call or tool response -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-4600/optimizer.pt b/checkpoint-4600/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b1cc838a7dc321de17bd5f345192fb3c38e94809
--- /dev/null
+++ b/checkpoint-4600/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ccd702ea51b20495eba5844a3074ff4eb31559fb94b3b6d71f24ba8dff7299cd
+size 72807355
diff --git a/checkpoint-4600/processor_config.json b/checkpoint-4600/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-4600/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-4600/rng_state.pth b/checkpoint-4600/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-4600/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-4600/scheduler.pt b/checkpoint-4600/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..fa3098c81e713df175dbe75e6e8112afb8211b31
--- /dev/null
+++ b/checkpoint-4600/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d632bb86d093612d43b72d8b08b04690461ffe2f2b2bc3dedcba090bfc88d928
+size 1465
diff --git a/checkpoint-4600/tokenizer.json b/checkpoint-4600/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-4600/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-4600/tokenizer_config.json b/checkpoint-4600/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-4600/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-4600/trainer_state.json b/checkpoint-4600/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..65a3b117e759497d1514643514ef43df0ddf9ac0
--- /dev/null
+++ b/checkpoint-4600/trainer_state.json
@@ -0,0 +1,6482 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.8369723435225619,
+  "eval_steps": 100,
+  "global_step": 4600,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6377365356622998,
+      "grad_norm": 0.17014239728450775,
+      "learning_rate": 1.5370045130657366e-05,
+      "loss": 0.14694437980651856,
+      "step": 3505
+    },
+    {
+      "epoch": 0.638646288209607,
+      "grad_norm": 0.14744038879871368,
+      "learning_rate": 1.5302158945041838e-05,
+      "loss": 0.14434736967086792,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6395560407569141,
+      "grad_norm": 0.2069770246744156,
+      "learning_rate": 1.523435683031818e-05,
+      "loss": 0.13982917070388795,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6404657933042213,
+      "grad_norm": 0.17811502516269684,
+      "learning_rate": 1.5166639374265063e-05,
+      "loss": 0.1408839702606201,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6413755458515283,
+      "grad_norm": 0.165786474943161,
+      "learning_rate": 1.509900716392728e-05,
+      "loss": 0.15312877893447877,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6422852983988355,
+      "grad_norm": 0.1633884161710739,
+      "learning_rate": 1.5031460785610596e-05,
+      "loss": 0.1488795518875122,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6431950509461426,
+      "grad_norm": 0.16498984396457672,
+      "learning_rate": 1.4964000824876723e-05,
+      "loss": 0.15031465291976928,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6441048034934498,
+      "grad_norm": 0.18043678998947144,
+      "learning_rate": 1.4896627866538191e-05,
+      "loss": 0.147829806804657,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6450145560407569,
+      "grad_norm": 0.16813597083091736,
+      "learning_rate": 1.4829342494653315e-05,
+      "loss": 0.1418998956680298,
+      "step": 3545
+    },
+    {
+      "epoch": 0.645924308588064,
+      "grad_norm": 0.1817242056131363,
+      "learning_rate": 1.4762145292521118e-05,
+      "loss": 0.14508869647979736,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6468340611353712,
+      "grad_norm": 0.14666494727134705,
+      "learning_rate": 1.469503684267628e-05,
+      "loss": 0.14159854650497436,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6477438136826783,
+      "grad_norm": 0.16485381126403809,
+      "learning_rate": 1.4628017726884086e-05,
+      "loss": 0.14419105052947997,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6486535662299855,
+      "grad_norm": 0.16100342571735382,
+      "learning_rate": 1.4561088526135375e-05,
+      "loss": 0.14501721858978273,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6495633187772926,
+      "grad_norm": 0.16996590793132782,
+      "learning_rate": 1.4494249820641493e-05,
+      "loss": 0.1377166509628296,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6504730713245997,
+      "grad_norm": 0.16168837249279022,
+      "learning_rate": 1.4427502189829339e-05,
+      "loss": 0.1414325475692749,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6513828238719068,
+      "grad_norm": 0.16318906843662262,
+      "learning_rate": 1.436084621233621e-05,
+      "loss": 0.14685193300247193,
+      "step": 3580
+    },
+    {
+      "epoch": 0.652292576419214,
+      "grad_norm": 0.1636219322681427,
+      "learning_rate": 1.4294282466004899e-05,
+      "loss": 0.1405899167060852,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6532023289665211,
+      "grad_norm": 0.1838461309671402,
+      "learning_rate": 1.422781152787865e-05,
+      "loss": 0.14386332035064697,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6541120815138283,
+      "grad_norm": 0.1796344667673111,
+      "learning_rate": 1.4161433974196115e-05,
+      "loss": 0.1513024687767029,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6550218340611353,
+      "grad_norm": 0.16424529254436493,
+      "learning_rate": 1.4095150380386427e-05,
+      "loss": 0.14238927364349366,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6559315866084425,
+      "grad_norm": 0.19264160096645355,
+      "learning_rate": 1.402896132106415e-05,
+      "loss": 0.14297477006912232,
+      "step": 3605
+    },
+    {
+      "epoch": 0.6568413391557496,
+      "grad_norm": 0.18319948017597198,
+      "learning_rate": 1.3962867370024347e-05,
+      "loss": 0.1448880434036255,
+      "step": 3610
+    },
+    {
+      "epoch": 0.6577510917030568,
+      "grad_norm": 0.16507290303707123,
+      "learning_rate": 1.389686910023758e-05,
+      "loss": 0.14724698066711425,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6586608442503639,
+      "grad_norm": 0.17871244251728058,
+      "learning_rate": 1.3830967083844942e-05,
+      "loss": 0.14479386806488037,
+      "step": 3620
+    },
+    {
+      "epoch": 0.659570596797671,
+      "grad_norm": 0.1846228390932083,
+      "learning_rate": 1.3765161892153112e-05,
+      "loss": 0.1453616738319397,
+      "step": 3625
+    },
+    {
+      "epoch": 0.6604803493449781,
+      "grad_norm": 0.17185978591442108,
+      "learning_rate": 1.3699454095629372e-05,
+      "loss": 0.14906206130981445,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6613901018922853,
+      "grad_norm": 0.14751191437244415,
+      "learning_rate": 1.3633844263896698e-05,
+      "loss": 0.13991892337799072,
+      "step": 3635
+    },
+    {
+      "epoch": 0.6622998544395924,
+      "grad_norm": 0.22059763967990875,
+      "learning_rate": 1.3568332965728817e-05,
+      "loss": 0.14680869579315187,
+      "step": 3640
+    },
+    {
+      "epoch": 0.6632096069868996,
+      "grad_norm": 0.15295909345149994,
+      "learning_rate": 1.3502920769045232e-05,
+      "loss": 0.1404443383216858,
+      "step": 3645
+    },
+    {
+      "epoch": 0.6641193595342066,
+      "grad_norm": 0.14600558578968048,
+      "learning_rate": 1.3437608240906364e-05,
+      "loss": 0.14663270711898804,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6650291120815138,
+      "grad_norm": 0.15548352897167206,
+      "learning_rate": 1.3372395947508587e-05,
+      "loss": 0.1431443452835083,
+      "step": 3655
+    },
+    {
+      "epoch": 0.665938864628821,
+      "grad_norm": 0.1813388466835022,
+      "learning_rate": 1.3307284454179342e-05,
+      "loss": 0.1458706736564636,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6668486171761281,
+      "grad_norm": 0.16326870024204254,
+      "learning_rate": 1.3242274325372247e-05,
+      "loss": 0.14700595140457154,
+      "step": 3665
+    },
+    {
+      "epoch": 0.6677583697234353,
+      "grad_norm": 0.18779197335243225,
+      "learning_rate": 1.3177366124662149e-05,
+      "loss": 0.1497237801551819,
+      "step": 3670
+    },
+    {
+      "epoch": 0.6686681222707423,
+      "grad_norm": 0.16291002929210663,
+      "learning_rate": 1.3112560414740315e-05,
+      "loss": 0.1387086868286133,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6695778748180495,
+      "grad_norm": 0.1532297134399414,
+      "learning_rate": 1.3047857757409487e-05,
+      "loss": 0.14497545957565308,
+      "step": 3680
+    },
+    {
+      "epoch": 0.6704876273653566,
+      "grad_norm": 0.14697515964508057,
+      "learning_rate": 1.2983258713579066e-05,
+      "loss": 0.1494283437728882,
+      "step": 3685
+    },
+    {
+      "epoch": 0.6713973799126638,
+      "grad_norm": 0.15213452279567719,
+      "learning_rate": 1.2918763843260218e-05,
+      "loss": 0.1468907594680786,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6723071324599709,
+      "grad_norm": 0.1745215803384781,
+      "learning_rate": 1.285437370556099e-05,
+      "loss": 0.14997754096984864,
+      "step": 3695
+    },
+    {
+      "epoch": 0.673216885007278,
+      "grad_norm": 0.19207637012004852,
+      "learning_rate": 1.2790088858681577e-05,
+      "loss": 0.14202862977981567,
+      "step": 3700
+    },
+    {
+      "epoch": 0.6741266375545851,
+      "grad_norm": 0.1521359086036682,
+      "learning_rate": 1.2725909859909313e-05,
+      "loss": 0.14547673463821412,
+      "step": 3705
+    },
+    {
+      "epoch": 0.6750363901018923,
+      "grad_norm": 0.16975535452365875,
+      "learning_rate": 1.2661837265613999e-05,
+      "loss": 0.14006874561309815,
+      "step": 3710
+    },
+    {
+      "epoch": 0.6759461426491994,
+      "grad_norm": 0.22234582901000977,
+      "learning_rate": 1.2597871631242992e-05,
+      "loss": 0.13691173791885375,
+      "step": 3715
+    },
+    {
+      "epoch": 0.6768558951965066,
+      "grad_norm": 0.16082969307899475,
+      "learning_rate": 1.2534013511316383e-05,
+      "loss": 0.14932308197021485,
+      "step": 3720
+    },
+    {
+      "epoch": 0.6777656477438136,
+      "grad_norm": 0.1751091182231903,
+      "learning_rate": 1.247026345942226e-05,
+      "loss": 0.14531974792480468,
+      "step": 3725
+    },
+    {
+      "epoch": 0.6786754002911208,
+      "grad_norm": 0.15838147699832916,
+      "learning_rate": 1.2406622028211844e-05,
+      "loss": 0.14759832620620728,
+      "step": 3730
+    },
+    {
+      "epoch": 0.6795851528384279,
+      "grad_norm": 0.1771744042634964,
+      "learning_rate": 1.2343089769394714e-05,
+      "loss": 0.1382831573486328,
+      "step": 3735
+    },
+    {
+      "epoch": 0.6804949053857351,
+      "grad_norm": 0.16301538050174713,
+      "learning_rate": 1.2279667233734037e-05,
+      "loss": 0.14444775581359864,
+      "step": 3740
+    },
+    {
+      "epoch": 0.6814046579330422,
+      "grad_norm": 0.1584121286869049,
+      "learning_rate": 1.2216354971041796e-05,
+      "loss": 0.14200170040130616,
+      "step": 3745
+    },
+    {
+      "epoch": 0.6823144104803494,
+      "grad_norm": 0.139187291264534,
+      "learning_rate": 1.2153153530174007e-05,
+      "loss": 0.14318310022354125,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6832241630276564,
+      "grad_norm": 0.13665248453617096,
+      "learning_rate": 1.2090063459025955e-05,
+      "loss": 0.1411946654319763,
+      "step": 3755
+    },
+    {
+      "epoch": 0.6841339155749636,
+      "grad_norm": 0.16273781657218933,
+      "learning_rate": 1.2027085304527475e-05,
+      "loss": 0.14873508214950562,
+      "step": 3760
+    },
+    {
+      "epoch": 0.6850436681222707,
+      "grad_norm": 0.16317526996135712,
+      "learning_rate": 1.1964219612638194e-05,
+      "loss": 0.14644203186035157,
+      "step": 3765
+    },
+    {
+      "epoch": 0.6859534206695779,
+      "grad_norm": 0.17253617942333221,
+      "learning_rate": 1.1901466928342777e-05,
+      "loss": 0.14027841091156007,
+      "step": 3770
+    },
+    {
+      "epoch": 0.6868631732168851,
+      "grad_norm": 0.19692830741405487,
+      "learning_rate": 1.183882779564624e-05,
+      "loss": 0.14411110877990724,
+      "step": 3775
+    },
+    {
+      "epoch": 0.6877729257641921,
+      "grad_norm": 0.15444578230381012,
+      "learning_rate": 1.1776302757569214e-05,
+      "loss": 0.14355008602142333,
+      "step": 3780
+    },
+    {
+      "epoch": 0.6886826783114993,
+      "grad_norm": 0.1622200757265091,
+      "learning_rate": 1.1713892356143239e-05,
+      "loss": 0.14794334173202514,
+      "step": 3785
+    },
+    {
+      "epoch": 0.6895924308588064,
+      "grad_norm": 0.1898501068353653,
+      "learning_rate": 1.1651597132406073e-05,
+      "loss": 0.1418622612953186,
+      "step": 3790
+    },
+    {
+      "epoch": 0.6905021834061136,
+      "grad_norm": 0.17803208529949188,
+      "learning_rate": 1.1589417626396973e-05,
+      "loss": 0.14576040506362914,
+      "step": 3795
+    },
+    {
+      "epoch": 0.6914119359534207,
+      "grad_norm": 0.17138013243675232,
+      "learning_rate": 1.1527354377152053e-05,
+      "loss": 0.14494270086288452,
+      "step": 3800
+    },
+    {
+      "epoch": 0.6923216885007278,
+      "grad_norm": 0.15170913934707642,
+      "learning_rate": 1.1465407922699603e-05,
+      "loss": 0.144084370136261,
+      "step": 3805
+    },
+    {
+      "epoch": 0.6932314410480349,
+      "grad_norm": 0.158562570810318,
+      "learning_rate": 1.1403578800055387e-05,
+      "loss": 0.13636608123779298,
+      "step": 3810
+    },
+    {
+      "epoch": 0.6941411935953421,
+      "grad_norm": 0.17687302827835083,
+      "learning_rate": 1.1341867545218044e-05,
+      "loss": 0.14214688539505005,
+      "step": 3815
+    },
+    {
+      "epoch": 0.6950509461426492,
+      "grad_norm": 0.15394899249076843,
+      "learning_rate": 1.1280274693164378e-05,
+      "loss": 0.14914129972457885,
+      "step": 3820
+    },
+    {
+      "epoch": 0.6959606986899564,
+      "grad_norm": 0.15709355473518372,
+      "learning_rate": 1.12188007778448e-05,
+      "loss": 0.14798580408096312,
+      "step": 3825
+    },
+    {
+      "epoch": 0.6968704512372634,
+      "grad_norm": 0.16631539165973663,
+      "learning_rate": 1.115744633217864e-05,
+      "loss": 0.14756966829299928,
+      "step": 3830
+    },
+    {
+      "epoch": 0.6977802037845706,
+      "grad_norm": 0.15893076360225677,
+      "learning_rate": 1.109621188804951e-05,
+      "loss": 0.14061959981918334,
+      "step": 3835
+    },
+    {
+      "epoch": 0.6986899563318777,
+      "grad_norm": 0.183414489030838,
+      "learning_rate": 1.103509797630077e-05,
+      "loss": 0.1448473334312439,
+      "step": 3840
+    },
+    {
+      "epoch": 0.6995997088791849,
+      "grad_norm": 0.14087305963039398,
+      "learning_rate": 1.0974105126730841e-05,
+      "loss": 0.14369285106658936,
+      "step": 3845
+    },
+    {
+      "epoch": 0.700509461426492,
+      "grad_norm": 0.16919967532157898,
+      "learning_rate": 1.0913233868088685e-05,
+      "loss": 0.1478085398674011,
+      "step": 3850
+    },
+    {
+      "epoch": 0.7014192139737991,
+      "grad_norm": 0.1439533829689026,
+      "learning_rate": 1.0852484728069178e-05,
+      "loss": 0.14376721382141114,
+      "step": 3855
+    },
+    {
+      "epoch": 0.7023289665211062,
+      "grad_norm": 0.17719274759292603,
+      "learning_rate": 1.0791858233308521e-05,
+      "loss": 0.14089040756225585,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7032387190684134,
+      "grad_norm": 0.19753769040107727,
+      "learning_rate": 1.0731354909379754e-05,
+      "loss": 0.15021742582321168,
+      "step": 3865
+    },
+    {
+      "epoch": 0.7041484716157205,
+      "grad_norm": 0.19186992943286896,
+      "learning_rate": 1.0670975280788086e-05,
+      "loss": 0.14113202095031738,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7050582241630277,
+      "grad_norm": 0.1709229201078415,
+      "learning_rate": 1.0610719870966443e-05,
+      "loss": 0.1500566840171814,
+      "step": 3875
+    },
+    {
+      "epoch": 0.7059679767103348,
+      "grad_norm": 0.17846204340457916,
+      "learning_rate": 1.0550589202270892e-05,
+      "loss": 0.15014195442199707,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7068777292576419,
+      "grad_norm": 0.1827082335948944,
+      "learning_rate": 1.0490583795976091e-05,
+      "loss": 0.1423472762107849,
+      "step": 3885
+    },
+    {
+      "epoch": 0.7077874818049491,
+      "grad_norm": 0.17418377101421356,
+      "learning_rate": 1.043070417227083e-05,
+      "loss": 0.14668900966644288,
+      "step": 3890
+    },
+    {
+      "epoch": 0.7086972343522562,
+      "grad_norm": 0.17385616898536682,
+      "learning_rate": 1.0370950850253449e-05,
+      "loss": 0.14627279043197633,
+      "step": 3895
+    },
+    {
+      "epoch": 0.7096069868995634,
+      "grad_norm": 0.16486723721027374,
+      "learning_rate": 1.0311324347927404e-05,
+      "loss": 0.14603652954101562,
+      "step": 3900
+    },
+    {
+      "epoch": 0.7105167394468704,
+      "grad_norm": 0.21806862950325012,
+      "learning_rate": 1.0251825182196732e-05,
+      "loss": 0.1488169550895691,
+      "step": 3905
+    },
+    {
+      "epoch": 0.7114264919941776,
+      "grad_norm": 0.19884569942951202,
+      "learning_rate": 1.019245386886159e-05,
+      "loss": 0.14387656450271608,
+      "step": 3910
+    },
+    {
+      "epoch": 0.7123362445414847,
+      "grad_norm": 0.16139011085033417,
+      "learning_rate": 1.0133210922613789e-05,
+      "loss": 0.1483074426651001,
+      "step": 3915
+    },
+    {
+      "epoch": 0.7132459970887919,
+      "grad_norm": 0.17000740766525269,
+      "learning_rate": 1.007409685703229e-05,
+      "loss": 0.14050065279006957,
+      "step": 3920
+    },
+    {
+      "epoch": 0.714155749636099,
+      "grad_norm": 0.17235304415225983,
+      "learning_rate": 1.0015112184578813e-05,
+      "loss": 0.1440442681312561,
+      "step": 3925
+    },
+    {
+      "epoch": 0.7150655021834061,
+      "grad_norm": 0.15737567842006683,
+      "learning_rate": 9.956257416593362e-06,
+      "loss": 0.14960765838623047,
+      "step": 3930
+    },
+    {
+      "epoch": 0.7159752547307132,
+      "grad_norm": 0.15499180555343628,
+      "learning_rate": 9.897533063289773e-06,
+      "loss": 0.14488829374313356,
+      "step": 3935
+    },
+    {
+      "epoch": 0.7168850072780204,
+      "grad_norm": 0.17744216322898865,
+      "learning_rate": 9.838939633751337e-06,
+      "loss": 0.1416949987411499,
+      "step": 3940
+    },
+    {
+      "epoch": 0.7177947598253275,
+      "grad_norm": 0.1597192883491516,
+      "learning_rate": 9.780477635926358e-06,
+      "loss": 0.14275280237197877,
+      "step": 3945
+    },
+    {
+      "epoch": 0.7187045123726347,
+      "grad_norm": 0.17800374329090118,
+      "learning_rate": 9.722147576623743e-06,
+      "loss": 0.14532098770141602,
+      "step": 3950
+    },
+    {
+      "epoch": 0.7196142649199417,
+      "grad_norm": 0.1828162521123886,
+      "learning_rate": 9.66394996150864e-06,
+      "loss": 0.14525585174560546,
+      "step": 3955
+    },
+    {
+      "epoch": 0.7205240174672489,
+      "grad_norm": 0.1800539344549179,
+      "learning_rate": 9.605885295098005e-06,
+      "loss": 0.14235819578170777,
+      "step": 3960
+    },
+    {
+      "epoch": 0.721433770014556,
+      "grad_norm": 0.16556483507156372,
+      "learning_rate": 9.54795408075628e-06,
+      "loss": 0.13965482711791993,
+      "step": 3965
+    },
+    {
+      "epoch": 0.7223435225618632,
+      "grad_norm": 0.1592024862766266,
+      "learning_rate": 9.49015682069101e-06,
+      "loss": 0.14051042795181273,
+      "step": 3970
+    },
+    {
+      "epoch": 0.7232532751091703,
+      "grad_norm": 0.18988847732543945,
+      "learning_rate": 9.43249401594846e-06,
+      "loss": 0.1436900496482849,
+      "step": 3975
+    },
+    {
+      "epoch": 0.7241630276564774,
+      "grad_norm": 0.24433808028697968,
+      "learning_rate": 9.374966166409329e-06,
+      "loss": 0.14883997440338134,
+      "step": 3980
+    },
+    {
+      "epoch": 0.7250727802037845,
+      "grad_norm": 0.15091639757156372,
+      "learning_rate": 9.317573770784352e-06,
+      "loss": 0.14726560115814208,
+      "step": 3985
+    },
+    {
+      "epoch": 0.7259825327510917,
+      "grad_norm": 0.17045573890209198,
+      "learning_rate": 9.260317326610051e-06,
+      "loss": 0.14120506048202514,
+      "step": 3990
+    },
+    {
+      "epoch": 0.7268922852983989,
+      "grad_norm": 0.18847957253456116,
+      "learning_rate": 9.203197330244343e-06,
+      "loss": 0.1377041220664978,
+      "step": 3995
+    },
+    {
+      "epoch": 0.727802037845706,
+      "grad_norm": 0.1516445279121399,
+      "learning_rate": 9.14621427686229e-06,
+      "loss": 0.14043946266174318,
+      "step": 4000
+    },
+    {
+      "epoch": 0.7287117903930131,
+      "grad_norm": 0.18264050781726837,
+      "learning_rate": 9.0893686604518e-06,
+      "loss": 0.14080368280410765,
+      "step": 4005
+    },
+    {
+      "epoch": 0.7296215429403202,
+      "grad_norm": 0.19129371643066406,
+      "learning_rate": 9.032660973809312e-06,
+      "loss": 0.1402561902999878,
+      "step": 4010
+    },
+    {
+      "epoch": 0.7305312954876274,
+      "grad_norm": 0.15762710571289062,
+      "learning_rate": 8.976091708535567e-06,
+      "loss": 0.14421157836914061,
+      "step": 4015
+    },
+    {
+      "epoch": 0.7314410480349345,
+      "grad_norm": 0.17785198986530304,
+      "learning_rate": 8.919661355031331e-06,
+      "loss": 0.14999009370803834,
+      "step": 4020
+    },
+    {
+      "epoch": 0.7323508005822417,
+      "grad_norm": 0.15306031703948975,
+      "learning_rate": 8.8633704024931e-06,
+      "loss": 0.14101698398590087,
+      "step": 4025
+    },
+    {
+      "epoch": 0.7332605531295487,
+      "grad_norm": 0.16481758654117584,
+      "learning_rate": 8.807219338908968e-06,
+      "loss": 0.14170764684677123,
+      "step": 4030
+    },
+    {
+      "epoch": 0.7341703056768559,
+      "grad_norm": 0.14892235398292542,
+      "learning_rate": 8.751208651054257e-06,
+      "loss": 0.15317896604537964,
+      "step": 4035
+    },
+    {
+      "epoch": 0.735080058224163,
+      "grad_norm": 0.1775592565536499,
+      "learning_rate": 8.695338824487409e-06,
+      "loss": 0.1520617723464966,
+      "step": 4040
+    },
+    {
+      "epoch": 0.7359898107714702,
+      "grad_norm": 0.1614258885383606,
+      "learning_rate": 8.639610343545728e-06,
+      "loss": 0.13747400045394897,
+      "step": 4045
+    },
+    {
+      "epoch": 0.7368995633187773,
+      "grad_norm": 0.21415506303310394,
+      "learning_rate": 8.58402369134117e-06,
+      "loss": 0.1432439088821411,
+      "step": 4050
+    },
+    {
+      "epoch": 0.7378093158660844,
+      "grad_norm": 0.1759418249130249,
+      "learning_rate": 8.528579349756205e-06,
+      "loss": 0.141641104221344,
+      "step": 4055
+    },
+    {
+      "epoch": 0.7387190684133915,
+      "grad_norm": 0.16738329827785492,
+      "learning_rate": 8.47327779943957e-06,
+      "loss": 0.14294810295104982,
+      "step": 4060
+    },
+    {
+      "epoch": 0.7396288209606987,
+      "grad_norm": 0.13916844129562378,
+      "learning_rate": 8.41811951980217e-06,
+      "loss": 0.13876968622207642,
+      "step": 4065
+    },
+    {
+      "epoch": 0.7405385735080058,
+      "grad_norm": 0.1828441321849823,
+      "learning_rate": 8.36310498901288e-06,
+      "loss": 0.148428475856781,
+      "step": 4070
+    },
+    {
+      "epoch": 0.741448326055313,
+      "grad_norm": 0.16534076631069183,
+      "learning_rate": 8.308234683994415e-06,
+      "loss": 0.14222711324691772,
+      "step": 4075
+    },
+    {
+      "epoch": 0.74235807860262,
+      "grad_norm": 0.17922644317150116,
+      "learning_rate": 8.253509080419198e-06,
+      "loss": 0.14365782737731933,
+      "step": 4080
+    },
+    {
+      "epoch": 0.7432678311499272,
+      "grad_norm": 0.15061035752296448,
+      "learning_rate": 8.198928652705204e-06,
+      "loss": 0.13571925163269044,
+      "step": 4085
+    },
+    {
+      "epoch": 0.7441775836972343,
+      "grad_norm": 0.18075402081012726,
+      "learning_rate": 8.144493874011908e-06,
+      "loss": 0.14385528564453126,
+      "step": 4090
+    },
+    {
+      "epoch": 0.7450873362445415,
+      "grad_norm": 0.16514739394187927,
+      "learning_rate": 8.090205216236135e-06,
+      "loss": 0.14920626878738402,
+      "step": 4095
+    },
+    {
+      "epoch": 0.7459970887918487,
+      "grad_norm": 0.16453702747821808,
+      "learning_rate": 8.03606315000797e-06,
+      "loss": 0.14704222679138185,
+      "step": 4100
+    },
+    {
+      "epoch": 0.7469068413391557,
+      "grad_norm": 0.16719917953014374,
+      "learning_rate": 7.982068144686707e-06,
+      "loss": 0.14722511768341065,
+      "step": 4105
+    },
+    {
+      "epoch": 0.7478165938864629,
+      "grad_norm": 0.18499110639095306,
+      "learning_rate": 7.92822066835677e-06,
+      "loss": 0.1401848554611206,
+      "step": 4110
+    },
+    {
+      "epoch": 0.74872634643377,
+      "grad_norm": 0.17249563336372375,
+      "learning_rate": 7.87452118782363e-06,
+      "loss": 0.15132423639297485,
+      "step": 4115
+    },
+    {
+      "epoch": 0.7496360989810772,
+      "grad_norm": 0.15049682557582855,
+      "learning_rate": 7.8209701686098e-06,
+      "loss": 0.1341150164604187,
+      "step": 4120
+    },
+    {
+      "epoch": 0.7505458515283843,
+      "grad_norm": 0.16892646253108978,
+      "learning_rate": 7.767568074950751e-06,
+      "loss": 0.1466840147972107,
+      "step": 4125
+    },
+    {
+      "epoch": 0.7514556040756915,
+      "grad_norm": 0.17288286983966827,
+      "learning_rate": 7.714315369790942e-06,
+      "loss": 0.13819680213928223,
+      "step": 4130
+    },
+    {
+      "epoch": 0.7523653566229985,
+      "grad_norm": 0.21893996000289917,
+      "learning_rate": 7.661212514779745e-06,
+      "loss": 0.14369510412216185,
+      "step": 4135
+    },
+    {
+      "epoch": 0.7532751091703057,
+      "grad_norm": 0.1674601435661316,
+      "learning_rate": 7.608259970267509e-06,
+      "loss": 0.14810250997543334,
+      "step": 4140
+    },
+    {
+      "epoch": 0.7541848617176128,
+      "grad_norm": 0.15875539183616638,
+      "learning_rate": 7.555458195301526e-06,
+      "loss": 0.14103198051452637,
+      "step": 4145
+    },
+    {
+      "epoch": 0.75509461426492,
+      "grad_norm": 0.19454079866409302,
+      "learning_rate": 7.502807647622037e-06,
+      "loss": 0.13848764896392823,
+      "step": 4150
+    },
+    {
+      "epoch": 0.756004366812227,
+      "grad_norm": 0.1795455813407898,
+      "learning_rate": 7.450308783658341e-06,
+      "loss": 0.14459335803985596,
+      "step": 4155
+    },
+    {
+      "epoch": 0.7569141193595342,
+      "grad_norm": 0.1643362045288086,
+      "learning_rate": 7.397962058524735e-06,
+      "loss": 0.14335378408432006,
+      "step": 4160
+    },
+    {
+      "epoch": 0.7578238719068413,
+      "grad_norm": 0.16362066566944122,
+      "learning_rate": 7.3457679260166475e-06,
+      "loss": 0.14222005605697632,
+      "step": 4165
+    },
+    {
+      "epoch": 0.7587336244541485,
+      "grad_norm": 0.17313003540039062,
+      "learning_rate": 7.293726838606674e-06,
+      "loss": 0.14272255897521974,
+      "step": 4170
+    },
+    {
+      "epoch": 0.7596433770014556,
+      "grad_norm": 0.1809929460287094,
+      "learning_rate": 7.2418392474406405e-06,
+      "loss": 0.14089123010635377,
+      "step": 4175
+    },
+    {
+      "epoch": 0.7605531295487628,
+      "grad_norm": 0.14306005835533142,
+      "learning_rate": 7.19010560233373e-06,
+      "loss": 0.13531534671783446,
+      "step": 4180
+    },
+    {
+      "epoch": 0.7614628820960698,
+      "grad_norm": 0.15525390207767487,
+      "learning_rate": 7.138526351766559e-06,
+      "loss": 0.14340845346450806,
+      "step": 4185
+    },
+    {
+      "epoch": 0.762372634643377,
+      "grad_norm": 0.24478943645954132,
+      "learning_rate": 7.087101942881263e-06,
+      "loss": 0.14744555950164795,
+      "step": 4190
+    },
+    {
+      "epoch": 0.7632823871906841,
+      "grad_norm": 0.31335577368736267,
+      "learning_rate": 7.035832821477711e-06,
+      "loss": 0.1484094500541687,
+      "step": 4195
+    },
+    {
+      "epoch": 0.7641921397379913,
+      "grad_norm": 0.15140366554260254,
+      "learning_rate": 6.984719432009515e-06,
+      "loss": 0.14991614818572999,
+      "step": 4200
+    },
+    {
+      "epoch": 0.7651018922852983,
+      "grad_norm": 0.16125506162643433,
+      "learning_rate": 6.933762217580289e-06,
+      "loss": 0.1408134937286377,
+      "step": 4205
+    },
+    {
+      "epoch": 0.7660116448326055,
+      "grad_norm": 0.2501450181007385,
+      "learning_rate": 6.882961619939726e-06,
+      "loss": 0.13875640630722047,
+      "step": 4210
+    },
+    {
+      "epoch": 0.7669213973799127,
+      "grad_norm": 0.16227811574935913,
+      "learning_rate": 6.8323180794798245e-06,
+      "loss": 0.14138660430908204,
+      "step": 4215
+    },
+    {
+      "epoch": 0.7678311499272198,
+      "grad_norm": 0.16676810383796692,
+      "learning_rate": 6.781832035231053e-06,
+      "loss": 0.14696706533432008,
+      "step": 4220
+    },
+    {
+      "epoch": 0.768740902474527,
+      "grad_norm": 0.14638574421405792,
+      "learning_rate": 6.731503924858518e-06,
+      "loss": 0.14263020753860473,
+      "step": 4225
+    },
+    {
+      "epoch": 0.769650655021834,
+      "grad_norm": 0.17093190550804138,
+      "learning_rate": 6.681334184658211e-06,
+      "loss": 0.14694111347198485,
+      "step": 4230
+    },
+    {
+      "epoch": 0.7705604075691412,
+      "grad_norm": 0.17174287140369415,
+      "learning_rate": 6.631323249553201e-06,
+      "loss": 0.13854929208755493,
+      "step": 4235
+    },
+    {
+      "epoch": 0.7714701601164483,
+      "grad_norm": 0.14599016308784485,
+      "learning_rate": 6.5814715530898745e-06,
+      "loss": 0.14058833122253417,
+      "step": 4240
+    },
+    {
+      "epoch": 0.7723799126637555,
+      "grad_norm": 0.16222265362739563,
+      "learning_rate": 6.531779527434176e-06,
+      "loss": 0.1428326725959778,
+      "step": 4245
+    },
+    {
+      "epoch": 0.7732896652110626,
+      "grad_norm": 0.1741994023323059,
+      "learning_rate": 6.482247603367839e-06,
+      "loss": 0.13985042572021483,
+      "step": 4250
+    },
+    {
+      "epoch": 0.7741994177583698,
+      "grad_norm": 0.17427101731300354,
+      "learning_rate": 6.432876210284688e-06,
+      "loss": 0.1442667603492737,
+      "step": 4255
+    },
+    {
+      "epoch": 0.7751091703056768,
+      "grad_norm": 0.1665259599685669,
+      "learning_rate": 6.383665776186912e-06,
+      "loss": 0.1421986222267151,
+      "step": 4260
+    },
+    {
+      "epoch": 0.776018922852984,
+      "grad_norm": 0.1728232353925705,
+      "learning_rate": 6.334616727681303e-06,
+      "loss": 0.1367053508758545,
+      "step": 4265
+    },
+    {
+      "epoch": 0.7769286754002911,
+      "grad_norm": 0.15882381796836853,
+      "learning_rate": 6.285729489975639e-06,
+      "loss": 0.14551182985305786,
+      "step": 4270
+    },
+    {
+      "epoch": 0.7778384279475983,
+      "grad_norm": 0.242042675614357,
+      "learning_rate": 6.2370044868749115e-06,
+      "loss": 0.1455132007598877,
+      "step": 4275
+    },
+    {
+      "epoch": 0.7787481804949054,
+      "grad_norm": 0.1599501073360443,
+      "learning_rate": 6.188442140777742e-06,
+      "loss": 0.1424942970275879,
+      "step": 4280
+    },
+    {
+      "epoch": 0.7796579330422125,
+      "grad_norm": 0.15182635188102722,
+      "learning_rate": 6.140042872672647e-06,
+      "loss": 0.14212887287139891,
+      "step": 4285
+    },
+    {
+      "epoch": 0.7805676855895196,
+      "grad_norm": 0.1720375418663025,
+      "learning_rate": 6.091807102134403e-06,
+      "loss": 0.14243412017822266,
+      "step": 4290
+    },
+    {
+      "epoch": 0.7814774381368268,
+      "grad_norm": 0.16436047852039337,
+      "learning_rate": 6.043735247320454e-06,
+      "loss": 0.15035657882690429,
+      "step": 4295
+    },
+    {
+      "epoch": 0.7823871906841339,
+      "grad_norm": 0.1498408019542694,
+      "learning_rate": 5.995827724967218e-06,
+      "loss": 0.14494839906692505,
+      "step": 4300
+    },
+    {
+      "epoch": 0.7832969432314411,
+      "grad_norm": 0.16924560070037842,
+      "learning_rate": 5.948084950386535e-06,
+      "loss": 0.13581212759017944,
+      "step": 4305
+    },
+    {
+      "epoch": 0.7842066957787481,
+      "grad_norm": 0.15889139473438263,
+      "learning_rate": 5.900507337462036e-06,
+      "loss": 0.15071530342102052,
+      "step": 4310
+    },
+    {
+      "epoch": 0.7851164483260553,
+      "grad_norm": 0.17201054096221924,
+      "learning_rate": 5.853095298645542e-06,
+      "loss": 0.1398628830909729,
+      "step": 4315
+    },
+    {
+      "epoch": 0.7860262008733624,
+      "grad_norm": 0.17965619266033173,
+      "learning_rate": 5.805849244953548e-06,
+      "loss": 0.14666696786880493,
+      "step": 4320
+    },
+    {
+      "epoch": 0.7869359534206696,
+      "grad_norm": 0.17514032125473022,
+      "learning_rate": 5.758769585963569e-06,
+      "loss": 0.1383386731147766,
+      "step": 4325
+    },
+    {
+      "epoch": 0.7878457059679768,
+      "grad_norm": 0.17497631907463074,
+      "learning_rate": 5.7118567298106744e-06,
+      "loss": 0.14362354278564454,
+      "step": 4330
+    },
+    {
+      "epoch": 0.7887554585152838,
+      "grad_norm": 0.16770458221435547,
+      "learning_rate": 5.665111083183905e-06,
+      "loss": 0.14136618375778198,
+      "step": 4335
+    },
+    {
+      "epoch": 0.789665211062591,
+      "grad_norm": 0.17134106159210205,
+      "learning_rate": 5.618533051322747e-06,
+      "loss": 0.1401529550552368,
+      "step": 4340
+    },
+    {
+      "epoch": 0.7905749636098981,
+      "grad_norm": 0.19458788633346558,
+      "learning_rate": 5.5721230380136435e-06,
+      "loss": 0.1393273115158081,
+      "step": 4345
+    },
+    {
+      "epoch": 0.7914847161572053,
+      "grad_norm": 0.19483692944049835,
+      "learning_rate": 5.525881445586467e-06,
+      "loss": 0.1369825482368469,
+      "step": 4350
+    },
+    {
+      "epoch": 0.7923944687045124,
+      "grad_norm": 0.3052191734313965,
+      "learning_rate": 5.4798086749110495e-06,
+      "loss": 0.14762181043624878,
+      "step": 4355
+    },
+    {
+      "epoch": 0.7933042212518195,
+      "grad_norm": 0.164458766579628,
+      "learning_rate": 5.4339051253937065e-06,
+      "loss": 0.14501686096191407,
+      "step": 4360
+    },
+    {
+      "epoch": 0.7942139737991266,
+      "grad_norm": 0.1719193458557129,
+      "learning_rate": 5.3881711949737625e-06,
+      "loss": 0.13321092128753662,
+      "step": 4365
+    },
+    {
+      "epoch": 0.7951237263464338,
+      "grad_norm": 0.17219696938991547,
+      "learning_rate": 5.342607280120121e-06,
+      "loss": 0.1413906455039978,
+      "step": 4370
+    },
+    {
+      "epoch": 0.7960334788937409,
+      "grad_norm": 0.15083056688308716,
+      "learning_rate": 5.297213775827789e-06,
+      "loss": 0.14772192239761353,
+      "step": 4375
+    },
+    {
+      "epoch": 0.7969432314410481,
+      "grad_norm": 0.1699071079492569,
+      "learning_rate": 5.251991075614507e-06,
+      "loss": 0.1392375946044922,
+      "step": 4380
+    },
+    {
+      "epoch": 0.7978529839883551,
+      "grad_norm": 0.1680395007133484,
+      "learning_rate": 5.206939571517302e-06,
+      "loss": 0.14185575246810914,
+      "step": 4385
+    },
+    {
+      "epoch": 0.7987627365356623,
+      "grad_norm": 0.16526710987091064,
+      "learning_rate": 5.162059654089083e-06,
+      "loss": 0.15001428127288818,
+      "step": 4390
+    },
+    {
+      "epoch": 0.7996724890829694,
+      "grad_norm": 0.16281752288341522,
+      "learning_rate": 5.1173517123952794e-06,
+      "loss": 0.13747023344039916,
+      "step": 4395
+    },
+    {
+      "epoch": 0.8005822416302766,
+      "grad_norm": 0.1454378366470337,
+      "learning_rate": 5.072816134010458e-06,
+      "loss": 0.14710829257965088,
+      "step": 4400
+    },
+    {
+      "epoch": 0.8014919941775837,
+      "grad_norm": 0.16565890610218048,
+      "learning_rate": 5.028453305014966e-06,
+      "loss": 0.14138611555099487,
+      "step": 4405
+    },
+    {
+      "epoch": 0.8024017467248908,
+      "grad_norm": 0.1962810605764389,
+      "learning_rate": 4.984263609991577e-06,
+      "loss": 0.13836177587509155,
+      "step": 4410
+    },
+    {
+      "epoch": 0.8033114992721979,
+      "grad_norm": 0.16091369092464447,
+      "learning_rate": 4.940247432022149e-06,
+      "loss": 0.14407440423965454,
+      "step": 4415
+    },
+    {
+      "epoch": 0.8042212518195051,
+      "grad_norm": 0.1930241584777832,
+      "learning_rate": 4.89640515268433e-06,
+      "loss": 0.14346336126327514,
+      "step": 4420
+    },
+    {
+      "epoch": 0.8051310043668122,
+      "grad_norm": 0.19301500916481018,
+      "learning_rate": 4.852737152048242e-06,
+      "loss": 0.14174317121505736,
+      "step": 4425
+    },
+    {
+      "epoch": 0.8060407569141194,
+      "grad_norm": 0.1541353315114975,
+      "learning_rate": 4.80924380867315e-06,
+      "loss": 0.14100592136383056,
+      "step": 4430
+    },
+    {
+      "epoch": 0.8069505094614265,
+      "grad_norm": 0.16285750269889832,
+      "learning_rate": 4.765925499604243e-06,
+      "loss": 0.1441288709640503,
+      "step": 4435
+    },
+    {
+      "epoch": 0.8078602620087336,
+      "grad_norm": 0.17382675409317017,
+      "learning_rate": 4.722782600369299e-06,
+      "loss": 0.13763951063156127,
+      "step": 4440
+    },
+    {
+      "epoch": 0.8087700145560408,
+      "grad_norm": 0.1697344034910202,
+      "learning_rate": 4.679815484975505e-06,
+      "loss": 0.1410105347633362,
+      "step": 4445
+    },
+    {
+      "epoch": 0.8096797671033479,
+      "grad_norm": 0.19964542984962463,
+      "learning_rate": 4.637024525906131e-06,
+      "loss": 0.1439276695251465,
+      "step": 4450
+    },
+    {
+      "epoch": 0.8105895196506551,
+      "grad_norm": 0.165307879447937,
+      "learning_rate": 4.59441009411736e-06,
+      "loss": 0.13897504806518554,
+      "step": 4455
+    },
+    {
+      "epoch": 0.8114992721979621,
+      "grad_norm": 0.16687989234924316,
+      "learning_rate": 4.551972559035067e-06,
+      "loss": 0.1422593355178833,
+      "step": 4460
+    },
+    {
+      "epoch": 0.8124090247452693,
+      "grad_norm": 0.15737789869308472,
+      "learning_rate": 4.509712288551571e-06,
+      "loss": 0.1452128052711487,
+      "step": 4465
+    },
+    {
+      "epoch": 0.8133187772925764,
+      "grad_norm": 0.17116659879684448,
+      "learning_rate": 4.467629649022509e-06,
+      "loss": 0.14385371208190917,
+      "step": 4470
+    },
+    {
+      "epoch": 0.8142285298398836,
+      "grad_norm": 0.17457640171051025,
+      "learning_rate": 4.425725005263623e-06,
+      "loss": 0.14808475971221924,
+      "step": 4475
+    },
+    {
+      "epoch": 0.8151382823871907,
+      "grad_norm": 0.1621970385313034,
+      "learning_rate": 4.383998720547583e-06,
+      "loss": 0.13927959203720092,
+      "step": 4480
+    },
+    {
+      "epoch": 0.8160480349344978,
+      "grad_norm": 0.176296666264534,
+      "learning_rate": 4.342451156600896e-06,
+      "loss": 0.15041060447692872,
+      "step": 4485
+    },
+    {
+      "epoch": 0.8169577874818049,
+      "grad_norm": 0.17157645523548126,
+      "learning_rate": 4.301082673600698e-06,
+      "loss": 0.13932652473449708,
+      "step": 4490
+    },
+    {
+      "epoch": 0.8178675400291121,
+      "grad_norm": 0.15378527343273163,
+      "learning_rate": 4.259893630171682e-06,
+      "loss": 0.1406856894493103,
+      "step": 4495
+    },
+    {
+      "epoch": 0.8187772925764192,
+      "grad_norm": 0.1750226765871048,
+      "learning_rate": 4.218884383382987e-06,
+      "loss": 0.1350164532661438,
+      "step": 4500
+    },
+    {
+      "epoch": 0.8196870451237264,
+      "grad_norm": 0.1393742561340332,
+      "learning_rate": 4.178055288745053e-06,
+      "loss": 0.13769235610961914,
+      "step": 4505
+    },
+    {
+      "epoch": 0.8205967976710334,
+      "grad_norm": 0.1668994128704071,
+      "learning_rate": 4.137406700206617e-06,
+      "loss": 0.14029752016067504,
+      "step": 4510
+    },
+    {
+      "epoch": 0.8215065502183406,
+      "grad_norm": 0.1833454668521881,
+      "learning_rate": 4.0969389701515675e-06,
+      "loss": 0.14276301860809326,
+      "step": 4515
+    },
+    {
+      "epoch": 0.8224163027656477,
+      "grad_norm": 0.16187874972820282,
+      "learning_rate": 4.056652449395945e-06,
+      "loss": 0.1444832682609558,
+      "step": 4520
+    },
+    {
+      "epoch": 0.8233260553129549,
+      "grad_norm": 0.1453280746936798,
+      "learning_rate": 4.01654748718488e-06,
+      "loss": 0.14512733221054078,
+      "step": 4525
+    },
+    {
+      "epoch": 0.824235807860262,
+      "grad_norm": 0.1782725751399994,
+      "learning_rate": 3.976624431189563e-06,
+      "loss": 0.14093561172485353,
+      "step": 4530
+    },
+    {
+      "epoch": 0.8251455604075691,
+      "grad_norm": 0.17374491691589355,
+      "learning_rate": 3.936883627504234e-06,
+      "loss": 0.14031401872634888,
+      "step": 4535
+    },
+    {
+      "epoch": 0.8260553129548762,
+      "grad_norm": 0.1609172821044922,
+      "learning_rate": 3.897325420643174e-06,
+      "loss": 0.1428336262702942,
+      "step": 4540
+    },
+    {
+      "epoch": 0.8269650655021834,
+      "grad_norm": 0.1520884931087494,
+      "learning_rate": 3.85795015353774e-06,
+      "loss": 0.1460547924041748,
+      "step": 4545
+    },
+    {
+      "epoch": 0.8278748180494906,
+      "grad_norm": 0.20986326038837433,
+      "learning_rate": 3.818758167533376e-06,
+      "loss": 0.14706350564956666,
+      "step": 4550
+    },
+    {
+      "epoch": 0.8287845705967977,
+      "grad_norm": 0.16825413703918457,
+      "learning_rate": 3.7797498023866396e-06,
+      "loss": 0.14507200717926025,
+      "step": 4555
+    },
+    {
+      "epoch": 0.8296943231441049,
+      "grad_norm": 0.16758380830287933,
+      "learning_rate": 3.740925396262296e-06,
+      "loss": 0.14898381233215333,
+      "step": 4560
+    },
+    {
+      "epoch": 0.8306040756914119,
+      "grad_norm": 0.15207453072071075,
+      "learning_rate": 3.7022852857303503e-06,
+      "loss": 0.14138854742050172,
+      "step": 4565
+    },
+    {
+      "epoch": 0.8315138282387191,
+      "grad_norm": 0.15150749683380127,
+      "learning_rate": 3.66382980576315e-06,
+      "loss": 0.13894975185394287,
+      "step": 4570
+    },
+    {
+      "epoch": 0.8324235807860262,
+      "grad_norm": 0.17071188986301422,
+      "learning_rate": 3.625559289732472e-06,
+      "loss": 0.14072470664978026,
+      "step": 4575
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.154335618019104,
+      "learning_rate": 3.5874740694066294e-06,
+      "loss": 0.13791344165802003,
+      "step": 4580
+    },
+    {
+      "epoch": 0.8342430858806404,
+      "grad_norm": 0.14017128944396973,
+      "learning_rate": 3.5495744749476116e-06,
+      "loss": 0.14427922964096068,
+      "step": 4585
+    },
+    {
+      "epoch": 0.8351528384279476,
+      "grad_norm": 0.17210033535957336,
+      "learning_rate": 3.5118608349081983e-06,
+      "loss": 0.15191166400909423,
+      "step": 4590
+    },
+    {
+      "epoch": 0.8360625909752547,
+      "grad_norm": 0.18715685606002808,
+      "learning_rate": 3.4743334762291358e-06,
+      "loss": 0.14451316595077515,
+      "step": 4595
+    },
+    {
+      "epoch": 0.8369723435225619,
+      "grad_norm": 0.18079884350299835,
+      "learning_rate": 3.436992724236293e-06,
+      "loss": 0.13530746698379517,
+      "step": 4600
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.529148621952221e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-4600/training_args.bin b/checkpoint-4600/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-4600/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-4700/README.md b/checkpoint-4700/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-4700/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-4700/adapter_config.json b/checkpoint-4700/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-4700/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-4700/adapter_model.safetensors b/checkpoint-4700/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..705fdef1f7d1f50791858a03e6e3e3cc1cb33772
--- /dev/null
+++ b/checkpoint-4700/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:40795c528c213a9d0c33604ca35fda29783ae39cf438979be0a0a9399f3f749f
+size 169741912
diff --git a/checkpoint-4700/chat_template.jinja b/checkpoint-4700/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-4700/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders each entry of `properties` (iterated in dictsort order) as `key:{...}`, comma-separated. Per entry the fields are written in this order: description, enum (STRING) or items (ARRAY), nullable, properties/required (OBJECT), and finally type, which also closes the entry. With filter_keys true, keys named in standard_keys are skipped. The `required` argument is accepted but never read inside this macro. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}{#- found_first flips after the first rendered entry so later entries are preceded by ',' -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- add_comma records whether a field was already written for this entry, so the next field is preceded by ',' -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}{#- keys of the items schema whose value is none are omitted -#}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- no 'properties' mapping present: recurse over the value's own keys with filter_keys=true so the standard_keys are skipped -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool as `declaration:NAME{description:...,parameters:{...},response:{...}}`, reading name, description, parameters and the optional response from tool_data['function']. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the '}' that closes `parameters:{` is only written inside this branch, so a parameters object without a truthy 'type' leaves the braces unbalanced (and ends on a trailing ','). Confirm whether callers always supply 'type'. -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): the '}' that closes `response:{` is only written when the type upper-cases to OBJECT; any other response type leaves the brace open. Confirm this is intended. -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serializes a single value recursively. Strings are wrapped in the <|"|> delimiter, booleans become true/false, mappings become {k:v,...} in dictsort order, other sequences become [a,b,...], and anything else is rendered as-is. escape_keys controls whether mapping keys are also wrapped in the delimiter and is passed down to nested values. -#}
+    {%- if argument is string -%}{#- tested first: a string would otherwise also satisfy the sequence test further down -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Returns `text` with every span from '<|channel>' up to the following '<channel|>' removed, then trimmed. The text is split on '<channel|>'; within each piece only the part before the first '<|channel>' is kept, so an opener with no closer drops everything after it. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emits one tool result as <|tool_response>response:NAME{...}<tool_response|>. A mapping response is written as key:value pairs in dictsort order with unescaped keys; any other response is wrapped as {value:...}. tool_name is concatenated with string literals, so it must be a string. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type records the kind of the last special item emitted; it is read after the message loop to decide whether a generation prompt is needed -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block: a system turn is emitted when thinking is enabled, tools are supplied, or the first message has role system/developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the leading system/developer message has been rendered here, so the main loop iterates over the remaining messages only -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard (stays -1 when there is no user message) -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages: each non-tool message renders its turn header, optional thinking, tool calls, tool responses, content and turn terminator, in that order -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}{#- role:tool messages are not rendered on their own; the forward scan below picks them up when they follow an assistant message that has tool_calls -#}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel, only for messages after the last user message that also carry tool_calls -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag becomes true once at least one tool response block has been rendered for this message -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages; the scan stops at the first non-tool message -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name. .get() yields None for a missing 'name', and default() only replaces None when its second argument is true; without it the None would reach a string concatenation in format_tool_response_block and raise -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array; for an array only the 'text' parts are concatenated -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}{#- default(..., true) turns a missing or None text into '' so the concatenation cannot fail -#}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}{#- Turn ending, decided next: if the last item emitted was a tool call and no response was rendered, an opening <|tool_response> is written; if responses were rendered and there is no content, nothing is written; otherwise the turn is closed with <turn|> -#}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- Opens a new model turn for generation, unless the last item emitted was a tool call or tool response, in which case no new turn header is written -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-4700/optimizer.pt b/checkpoint-4700/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f6382636f2f7f8d21e409c7c01b88f8dec42f2ea
--- /dev/null
+++ b/checkpoint-4700/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49875dd003bad3ded6c6745226845868f086532e0126eeb03e8fdfe2d00d524a
+size 72807355
diff --git a/checkpoint-4700/processor_config.json b/checkpoint-4700/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-4700/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-4700/rng_state.pth b/checkpoint-4700/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-4700/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-4700/scheduler.pt b/checkpoint-4700/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..00cf0b221cced2313e6a1489c95f2debe3f3ffcc
--- /dev/null
+++ b/checkpoint-4700/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8a0ab8f7a4ad7058c95af284f748e7f4487cc3c575a38242c2321938a3cd3e3
+size 1465
diff --git a/checkpoint-4700/tokenizer.json b/checkpoint-4700/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-4700/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-4700/tokenizer_config.json b/checkpoint-4700/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-4700/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-4700/trainer_state.json b/checkpoint-4700/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6340bf480675fb0f5b7c50d1740bdef37eedc890
--- /dev/null
+++ b/checkpoint-4700/trainer_state.json
@@ -0,0 +1,6622 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.8551673944687045,
+  "eval_steps": 100,
+  "global_step": 4700,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6377365356622998,
+      "grad_norm": 0.17014239728450775,
+      "learning_rate": 1.5370045130657366e-05,
+      "loss": 0.14694437980651856,
+      "step": 3505
+    },
+    {
+      "epoch": 0.638646288209607,
+      "grad_norm": 0.14744038879871368,
+      "learning_rate": 1.5302158945041838e-05,
+      "loss": 0.14434736967086792,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6395560407569141,
+      "grad_norm": 0.2069770246744156,
+      "learning_rate": 1.523435683031818e-05,
+      "loss": 0.13982917070388795,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6404657933042213,
+      "grad_norm": 0.17811502516269684,
+      "learning_rate": 1.5166639374265063e-05,
+      "loss": 0.1408839702606201,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6413755458515283,
+      "grad_norm": 0.165786474943161,
+      "learning_rate": 1.509900716392728e-05,
+      "loss": 0.15312877893447877,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6422852983988355,
+      "grad_norm": 0.1633884161710739,
+      "learning_rate": 1.5031460785610596e-05,
+      "loss": 0.1488795518875122,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6431950509461426,
+      "grad_norm": 0.16498984396457672,
+      "learning_rate": 1.4964000824876723e-05,
+      "loss": 0.15031465291976928,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6441048034934498,
+      "grad_norm": 0.18043678998947144,
+      "learning_rate": 1.4896627866538191e-05,
+      "loss": 0.147829806804657,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6450145560407569,
+      "grad_norm": 0.16813597083091736,
+      "learning_rate": 1.4829342494653315e-05,
+      "loss": 0.1418998956680298,
+      "step": 3545
+    },
+    {
+      "epoch": 0.645924308588064,
+      "grad_norm": 0.1817242056131363,
+      "learning_rate": 1.4762145292521118e-05,
+      "loss": 0.14508869647979736,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6468340611353712,
+      "grad_norm": 0.14666494727134705,
+      "learning_rate": 1.469503684267628e-05,
+      "loss": 0.14159854650497436,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6477438136826783,
+      "grad_norm": 0.16485381126403809,
+      "learning_rate": 1.4628017726884086e-05,
+      "loss": 0.14419105052947997,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6486535662299855,
+      "grad_norm": 0.16100342571735382,
+      "learning_rate": 1.4561088526135375e-05,
+      "loss": 0.14501721858978273,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6495633187772926,
+      "grad_norm": 0.16996590793132782,
+      "learning_rate": 1.4494249820641493e-05,
+      "loss": 0.1377166509628296,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6504730713245997,
+      "grad_norm": 0.16168837249279022,
+      "learning_rate": 1.4427502189829339e-05,
+      "loss": 0.1414325475692749,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6513828238719068,
+      "grad_norm": 0.16318906843662262,
+      "learning_rate": 1.436084621233621e-05,
+      "loss": 0.14685193300247193,
+      "step": 3580
+    },
+    {
+      "epoch": 0.652292576419214,
+      "grad_norm": 0.1636219322681427,
+      "learning_rate": 1.4294282466004899e-05,
+      "loss": 0.1405899167060852,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6532023289665211,
+      "grad_norm": 0.1838461309671402,
+      "learning_rate": 1.422781152787865e-05,
+      "loss": 0.14386332035064697,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6541120815138283,
+      "grad_norm": 0.1796344667673111,
+      "learning_rate": 1.4161433974196115e-05,
+      "loss": 0.1513024687767029,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6550218340611353,
+      "grad_norm": 0.16424529254436493,
+      "learning_rate": 1.4095150380386427e-05,
+      "loss": 0.14238927364349366,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6559315866084425,
+      "grad_norm": 0.19264160096645355,
+      "learning_rate": 1.402896132106415e-05,
+      "loss": 0.14297477006912232,
+      "step": 3605
+    },
+    {
+      "epoch": 0.6568413391557496,
+      "grad_norm": 0.18319948017597198,
+      "learning_rate": 1.3962867370024347e-05,
+      "loss": 0.1448880434036255,
+      "step": 3610
+    },
+    {
+      "epoch": 0.6577510917030568,
+      "grad_norm": 0.16507290303707123,
+      "learning_rate": 1.389686910023758e-05,
+      "loss": 0.14724698066711425,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6586608442503639,
+      "grad_norm": 0.17871244251728058,
+      "learning_rate": 1.3830967083844942e-05,
+      "loss": 0.14479386806488037,
+      "step": 3620
+    },
+    {
+      "epoch": 0.659570596797671,
+      "grad_norm": 0.1846228390932083,
+      "learning_rate": 1.3765161892153112e-05,
+      "loss": 0.1453616738319397,
+      "step": 3625
+    },
+    {
+      "epoch": 0.6604803493449781,
+      "grad_norm": 0.17185978591442108,
+      "learning_rate": 1.3699454095629372e-05,
+      "loss": 0.14906206130981445,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6613901018922853,
+      "grad_norm": 0.14751191437244415,
+      "learning_rate": 1.3633844263896698e-05,
+      "loss": 0.13991892337799072,
+      "step": 3635
+    },
+    {
+      "epoch": 0.6622998544395924,
+      "grad_norm": 0.22059763967990875,
+      "learning_rate": 1.3568332965728817e-05,
+      "loss": 0.14680869579315187,
+      "step": 3640
+    },
+    {
+      "epoch": 0.6632096069868996,
+      "grad_norm": 0.15295909345149994,
+      "learning_rate": 1.3502920769045232e-05,
+      "loss": 0.1404443383216858,
+      "step": 3645
+    },
+    {
+      "epoch": 0.6641193595342066,
+      "grad_norm": 0.14600558578968048,
+      "learning_rate": 1.3437608240906364e-05,
+      "loss": 0.14663270711898804,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6650291120815138,
+      "grad_norm": 0.15548352897167206,
+      "learning_rate": 1.3372395947508587e-05,
+      "loss": 0.1431443452835083,
+      "step": 3655
+    },
+    {
+      "epoch": 0.665938864628821,
+      "grad_norm": 0.1813388466835022,
+      "learning_rate": 1.3307284454179342e-05,
+      "loss": 0.1458706736564636,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6668486171761281,
+      "grad_norm": 0.16326870024204254,
+      "learning_rate": 1.3242274325372247e-05,
+      "loss": 0.14700595140457154,
+      "step": 3665
+    },
+    {
+      "epoch": 0.6677583697234353,
+      "grad_norm": 0.18779197335243225,
+      "learning_rate": 1.3177366124662149e-05,
+      "loss": 0.1497237801551819,
+      "step": 3670
+    },
+    {
+      "epoch": 0.6686681222707423,
+      "grad_norm": 0.16291002929210663,
+      "learning_rate": 1.3112560414740315e-05,
+      "loss": 0.1387086868286133,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6695778748180495,
+      "grad_norm": 0.1532297134399414,
+      "learning_rate": 1.3047857757409487e-05,
+      "loss": 0.14497545957565308,
+      "step": 3680
+    },
+    {
+      "epoch": 0.6704876273653566,
+      "grad_norm": 0.14697515964508057,
+      "learning_rate": 1.2983258713579066e-05,
+      "loss": 0.1494283437728882,
+      "step": 3685
+    },
+    {
+      "epoch": 0.6713973799126638,
+      "grad_norm": 0.15213452279567719,
+      "learning_rate": 1.2918763843260218e-05,
+      "loss": 0.1468907594680786,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6723071324599709,
+      "grad_norm": 0.1745215803384781,
+      "learning_rate": 1.285437370556099e-05,
+      "loss": 0.14997754096984864,
+      "step": 3695
+    },
+    {
+      "epoch": 0.673216885007278,
+      "grad_norm": 0.19207637012004852,
+      "learning_rate": 1.2790088858681577e-05,
+      "loss": 0.14202862977981567,
+      "step": 3700
+    },
+    {
+      "epoch": 0.6741266375545851,
+      "grad_norm": 0.1521359086036682,
+      "learning_rate": 1.2725909859909313e-05,
+      "loss": 0.14547673463821412,
+      "step": 3705
+    },
+    {
+      "epoch": 0.6750363901018923,
+      "grad_norm": 0.16975535452365875,
+      "learning_rate": 1.2661837265613999e-05,
+      "loss": 0.14006874561309815,
+      "step": 3710
+    },
+    {
+      "epoch": 0.6759461426491994,
+      "grad_norm": 0.22234582901000977,
+      "learning_rate": 1.2597871631242992e-05,
+      "loss": 0.13691173791885375,
+      "step": 3715
+    },
+    {
+      "epoch": 0.6768558951965066,
+      "grad_norm": 0.16082969307899475,
+      "learning_rate": 1.2534013511316383e-05,
+      "loss": 0.14932308197021485,
+      "step": 3720
+    },
+    {
+      "epoch": 0.6777656477438136,
+      "grad_norm": 0.1751091182231903,
+      "learning_rate": 1.247026345942226e-05,
+      "loss": 0.14531974792480468,
+      "step": 3725
+    },
+    {
+      "epoch": 0.6786754002911208,
+      "grad_norm": 0.15838147699832916,
+      "learning_rate": 1.2406622028211844e-05,
+      "loss": 0.14759832620620728,
+      "step": 3730
+    },
+    {
+      "epoch": 0.6795851528384279,
+      "grad_norm": 0.1771744042634964,
+      "learning_rate": 1.2343089769394714e-05,
+      "loss": 0.1382831573486328,
+      "step": 3735
+    },
+    {
+      "epoch": 0.6804949053857351,
+      "grad_norm": 0.16301538050174713,
+      "learning_rate": 1.2279667233734037e-05,
+      "loss": 0.14444775581359864,
+      "step": 3740
+    },
+    {
+      "epoch": 0.6814046579330422,
+      "grad_norm": 0.1584121286869049,
+      "learning_rate": 1.2216354971041796e-05,
+      "loss": 0.14200170040130616,
+      "step": 3745
+    },
+    {
+      "epoch": 0.6823144104803494,
+      "grad_norm": 0.139187291264534,
+      "learning_rate": 1.2153153530174007e-05,
+      "loss": 0.14318310022354125,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6832241630276564,
+      "grad_norm": 0.13665248453617096,
+      "learning_rate": 1.2090063459025955e-05,
+      "loss": 0.1411946654319763,
+      "step": 3755
+    },
+    {
+      "epoch": 0.6841339155749636,
+      "grad_norm": 0.16273781657218933,
+      "learning_rate": 1.2027085304527475e-05,
+      "loss": 0.14873508214950562,
+      "step": 3760
+    },
+    {
+      "epoch": 0.6850436681222707,
+      "grad_norm": 0.16317526996135712,
+      "learning_rate": 1.1964219612638194e-05,
+      "loss": 0.14644203186035157,
+      "step": 3765
+    },
+    {
+      "epoch": 0.6859534206695779,
+      "grad_norm": 0.17253617942333221,
+      "learning_rate": 1.1901466928342777e-05,
+      "loss": 0.14027841091156007,
+      "step": 3770
+    },
+    {
+      "epoch": 0.6868631732168851,
+      "grad_norm": 0.19692830741405487,
+      "learning_rate": 1.183882779564624e-05,
+      "loss": 0.14411110877990724,
+      "step": 3775
+    },
+    {
+      "epoch": 0.6877729257641921,
+      "grad_norm": 0.15444578230381012,
+      "learning_rate": 1.1776302757569214e-05,
+      "loss": 0.14355008602142333,
+      "step": 3780
+    },
+    {
+      "epoch": 0.6886826783114993,
+      "grad_norm": 0.1622200757265091,
+      "learning_rate": 1.1713892356143239e-05,
+      "loss": 0.14794334173202514,
+      "step": 3785
+    },
+    {
+      "epoch": 0.6895924308588064,
+      "grad_norm": 0.1898501068353653,
+      "learning_rate": 1.1651597132406073e-05,
+      "loss": 0.1418622612953186,
+      "step": 3790
+    },
+    {
+      "epoch": 0.6905021834061136,
+      "grad_norm": 0.17803208529949188,
+      "learning_rate": 1.1589417626396973e-05,
+      "loss": 0.14576040506362914,
+      "step": 3795
+    },
+    {
+      "epoch": 0.6914119359534207,
+      "grad_norm": 0.17138013243675232,
+      "learning_rate": 1.1527354377152053e-05,
+      "loss": 0.14494270086288452,
+      "step": 3800
+    },
+    {
+      "epoch": 0.6923216885007278,
+      "grad_norm": 0.15170913934707642,
+      "learning_rate": 1.1465407922699603e-05,
+      "loss": 0.144084370136261,
+      "step": 3805
+    },
+    {
+      "epoch": 0.6932314410480349,
+      "grad_norm": 0.158562570810318,
+      "learning_rate": 1.1403578800055387e-05,
+      "loss": 0.13636608123779298,
+      "step": 3810
+    },
+    {
+      "epoch": 0.6941411935953421,
+      "grad_norm": 0.17687302827835083,
+      "learning_rate": 1.1341867545218044e-05,
+      "loss": 0.14214688539505005,
+      "step": 3815
+    },
+    {
+      "epoch": 0.6950509461426492,
+      "grad_norm": 0.15394899249076843,
+      "learning_rate": 1.1280274693164378e-05,
+      "loss": 0.14914129972457885,
+      "step": 3820
+    },
+    {
+      "epoch": 0.6959606986899564,
+      "grad_norm": 0.15709355473518372,
+      "learning_rate": 1.12188007778448e-05,
+      "loss": 0.14798580408096312,
+      "step": 3825
+    },
+    {
+      "epoch": 0.6968704512372634,
+      "grad_norm": 0.16631539165973663,
+      "learning_rate": 1.115744633217864e-05,
+      "loss": 0.14756966829299928,
+      "step": 3830
+    },
+    {
+      "epoch": 0.6977802037845706,
+      "grad_norm": 0.15893076360225677,
+      "learning_rate": 1.109621188804951e-05,
+      "loss": 0.14061959981918334,
+      "step": 3835
+    },
+    {
+      "epoch": 0.6986899563318777,
+      "grad_norm": 0.183414489030838,
+      "learning_rate": 1.103509797630077e-05,
+      "loss": 0.1448473334312439,
+      "step": 3840
+    },
+    {
+      "epoch": 0.6995997088791849,
+      "grad_norm": 0.14087305963039398,
+      "learning_rate": 1.0974105126730841e-05,
+      "loss": 0.14369285106658936,
+      "step": 3845
+    },
+    {
+      "epoch": 0.700509461426492,
+      "grad_norm": 0.16919967532157898,
+      "learning_rate": 1.0913233868088685e-05,
+      "loss": 0.1478085398674011,
+      "step": 3850
+    },
+    {
+      "epoch": 0.7014192139737991,
+      "grad_norm": 0.1439533829689026,
+      "learning_rate": 1.0852484728069178e-05,
+      "loss": 0.14376721382141114,
+      "step": 3855
+    },
+    {
+      "epoch": 0.7023289665211062,
+      "grad_norm": 0.17719274759292603,
+      "learning_rate": 1.0791858233308521e-05,
+      "loss": 0.14089040756225585,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7032387190684134,
+      "grad_norm": 0.19753769040107727,
+      "learning_rate": 1.0731354909379754e-05,
+      "loss": 0.15021742582321168,
+      "step": 3865
+    },
+    {
+      "epoch": 0.7041484716157205,
+      "grad_norm": 0.19186992943286896,
+      "learning_rate": 1.0670975280788086e-05,
+      "loss": 0.14113202095031738,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7050582241630277,
+      "grad_norm": 0.1709229201078415,
+      "learning_rate": 1.0610719870966443e-05,
+      "loss": 0.1500566840171814,
+      "step": 3875
+    },
+    {
+      "epoch": 0.7059679767103348,
+      "grad_norm": 0.17846204340457916,
+      "learning_rate": 1.0550589202270892e-05,
+      "loss": 0.15014195442199707,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7068777292576419,
+      "grad_norm": 0.1827082335948944,
+      "learning_rate": 1.0490583795976091e-05,
+      "loss": 0.1423472762107849,
+      "step": 3885
+    },
+    {
+      "epoch": 0.7077874818049491,
+      "grad_norm": 0.17418377101421356,
+      "learning_rate": 1.043070417227083e-05,
+      "loss": 0.14668900966644288,
+      "step": 3890
+    },
+    {
+      "epoch": 0.7086972343522562,
+      "grad_norm": 0.17385616898536682,
+      "learning_rate": 1.0370950850253449e-05,
+      "loss": 0.14627279043197633,
+      "step": 3895
+    },
+    {
+      "epoch": 0.7096069868995634,
+      "grad_norm": 0.16486723721027374,
+      "learning_rate": 1.0311324347927404e-05,
+      "loss": 0.14603652954101562,
+      "step": 3900
+    },
+    {
+      "epoch": 0.7105167394468704,
+      "grad_norm": 0.21806862950325012,
+      "learning_rate": 1.0251825182196732e-05,
+      "loss": 0.1488169550895691,
+      "step": 3905
+    },
+    {
+      "epoch": 0.7114264919941776,
+      "grad_norm": 0.19884569942951202,
+      "learning_rate": 1.019245386886159e-05,
+      "loss": 0.14387656450271608,
+      "step": 3910
+    },
+    {
+      "epoch": 0.7123362445414847,
+      "grad_norm": 0.16139011085033417,
+      "learning_rate": 1.0133210922613789e-05,
+      "loss": 0.1483074426651001,
+      "step": 3915
+    },
+    {
+      "epoch": 0.7132459970887919,
+      "grad_norm": 0.17000740766525269,
+      "learning_rate": 1.007409685703229e-05,
+      "loss": 0.14050065279006957,
+      "step": 3920
+    },
+    {
+      "epoch": 0.714155749636099,
+      "grad_norm": 0.17235304415225983,
+      "learning_rate": 1.0015112184578813e-05,
+      "loss": 0.1440442681312561,
+      "step": 3925
+    },
+    {
+      "epoch": 0.7150655021834061,
+      "grad_norm": 0.15737567842006683,
+      "learning_rate": 9.956257416593362e-06,
+      "loss": 0.14960765838623047,
+      "step": 3930
+    },
+    {
+      "epoch": 0.7159752547307132,
+      "grad_norm": 0.15499180555343628,
+      "learning_rate": 9.897533063289773e-06,
+      "loss": 0.14488829374313356,
+      "step": 3935
+    },
+    {
+      "epoch": 0.7168850072780204,
+      "grad_norm": 0.17744216322898865,
+      "learning_rate": 9.838939633751337e-06,
+      "loss": 0.1416949987411499,
+      "step": 3940
+    },
+    {
+      "epoch": 0.7177947598253275,
+      "grad_norm": 0.1597192883491516,
+      "learning_rate": 9.780477635926358e-06,
+      "loss": 0.14275280237197877,
+      "step": 3945
+    },
+    {
+      "epoch": 0.7187045123726347,
+      "grad_norm": 0.17800374329090118,
+      "learning_rate": 9.722147576623743e-06,
+      "loss": 0.14532098770141602,
+      "step": 3950
+    },
+    {
+      "epoch": 0.7196142649199417,
+      "grad_norm": 0.1828162521123886,
+      "learning_rate": 9.66394996150864e-06,
+      "loss": 0.14525585174560546,
+      "step": 3955
+    },
+    {
+      "epoch": 0.7205240174672489,
+      "grad_norm": 0.1800539344549179,
+      "learning_rate": 9.605885295098005e-06,
+      "loss": 0.14235819578170777,
+      "step": 3960
+    },
+    {
+      "epoch": 0.721433770014556,
+      "grad_norm": 0.16556483507156372,
+      "learning_rate": 9.54795408075628e-06,
+      "loss": 0.13965482711791993,
+      "step": 3965
+    },
+    {
+      "epoch": 0.7223435225618632,
+      "grad_norm": 0.1592024862766266,
+      "learning_rate": 9.49015682069101e-06,
+      "loss": 0.14051042795181273,
+      "step": 3970
+    },
+    {
+      "epoch": 0.7232532751091703,
+      "grad_norm": 0.18988847732543945,
+      "learning_rate": 9.43249401594846e-06,
+      "loss": 0.1436900496482849,
+      "step": 3975
+    },
+    {
+      "epoch": 0.7241630276564774,
+      "grad_norm": 0.24433808028697968,
+      "learning_rate": 9.374966166409329e-06,
+      "loss": 0.14883997440338134,
+      "step": 3980
+    },
+    {
+      "epoch": 0.7250727802037845,
+      "grad_norm": 0.15091639757156372,
+      "learning_rate": 9.317573770784352e-06,
+      "loss": 0.14726560115814208,
+      "step": 3985
+    },
+    {
+      "epoch": 0.7259825327510917,
+      "grad_norm": 0.17045573890209198,
+      "learning_rate": 9.260317326610051e-06,
+      "loss": 0.14120506048202514,
+      "step": 3990
+    },
+    {
+      "epoch": 0.7268922852983989,
+      "grad_norm": 0.18847957253456116,
+      "learning_rate": 9.203197330244343e-06,
+      "loss": 0.1377041220664978,
+      "step": 3995
+    },
+    {
+      "epoch": 0.727802037845706,
+      "grad_norm": 0.1516445279121399,
+      "learning_rate": 9.14621427686229e-06,
+      "loss": 0.14043946266174318,
+      "step": 4000
+    },
+    {
+      "epoch": 0.7287117903930131,
+      "grad_norm": 0.18264050781726837,
+      "learning_rate": 9.0893686604518e-06,
+      "loss": 0.14080368280410765,
+      "step": 4005
+    },
+    {
+      "epoch": 0.7296215429403202,
+      "grad_norm": 0.19129371643066406,
+      "learning_rate": 9.032660973809312e-06,
+      "loss": 0.1402561902999878,
+      "step": 4010
+    },
+    {
+      "epoch": 0.7305312954876274,
+      "grad_norm": 0.15762710571289062,
+      "learning_rate": 8.976091708535567e-06,
+      "loss": 0.14421157836914061,
+      "step": 4015
+    },
+    {
+      "epoch": 0.7314410480349345,
+      "grad_norm": 0.17785198986530304,
+      "learning_rate": 8.919661355031331e-06,
+      "loss": 0.14999009370803834,
+      "step": 4020
+    },
+    {
+      "epoch": 0.7323508005822417,
+      "grad_norm": 0.15306031703948975,
+      "learning_rate": 8.8633704024931e-06,
+      "loss": 0.14101698398590087,
+      "step": 4025
+    },
+    {
+      "epoch": 0.7332605531295487,
+      "grad_norm": 0.16481758654117584,
+      "learning_rate": 8.807219338908968e-06,
+      "loss": 0.14170764684677123,
+      "step": 4030
+    },
+    {
+      "epoch": 0.7341703056768559,
+      "grad_norm": 0.14892235398292542,
+      "learning_rate": 8.751208651054257e-06,
+      "loss": 0.15317896604537964,
+      "step": 4035
+    },
+    {
+      "epoch": 0.735080058224163,
+      "grad_norm": 0.1775592565536499,
+      "learning_rate": 8.695338824487409e-06,
+      "loss": 0.1520617723464966,
+      "step": 4040
+    },
+    {
+      "epoch": 0.7359898107714702,
+      "grad_norm": 0.1614258885383606,
+      "learning_rate": 8.639610343545728e-06,
+      "loss": 0.13747400045394897,
+      "step": 4045
+    },
+    {
+      "epoch": 0.7368995633187773,
+      "grad_norm": 0.21415506303310394,
+      "learning_rate": 8.58402369134117e-06,
+      "loss": 0.1432439088821411,
+      "step": 4050
+    },
+    {
+      "epoch": 0.7378093158660844,
+      "grad_norm": 0.1759418249130249,
+      "learning_rate": 8.528579349756205e-06,
+      "loss": 0.141641104221344,
+      "step": 4055
+    },
+    {
+      "epoch": 0.7387190684133915,
+      "grad_norm": 0.16738329827785492,
+      "learning_rate": 8.47327779943957e-06,
+      "loss": 0.14294810295104982,
+      "step": 4060
+    },
+    {
+      "epoch": 0.7396288209606987,
+      "grad_norm": 0.13916844129562378,
+      "learning_rate": 8.41811951980217e-06,
+      "loss": 0.13876968622207642,
+      "step": 4065
+    },
+    {
+      "epoch": 0.7405385735080058,
+      "grad_norm": 0.1828441321849823,
+      "learning_rate": 8.36310498901288e-06,
+      "loss": 0.148428475856781,
+      "step": 4070
+    },
+    {
+      "epoch": 0.741448326055313,
+      "grad_norm": 0.16534076631069183,
+      "learning_rate": 8.308234683994415e-06,
+      "loss": 0.14222711324691772,
+      "step": 4075
+    },
+    {
+      "epoch": 0.74235807860262,
+      "grad_norm": 0.17922644317150116,
+      "learning_rate": 8.253509080419198e-06,
+      "loss": 0.14365782737731933,
+      "step": 4080
+    },
+    {
+      "epoch": 0.7432678311499272,
+      "grad_norm": 0.15061035752296448,
+      "learning_rate": 8.198928652705204e-06,
+      "loss": 0.13571925163269044,
+      "step": 4085
+    },
+    {
+      "epoch": 0.7441775836972343,
+      "grad_norm": 0.18075402081012726,
+      "learning_rate": 8.144493874011908e-06,
+      "loss": 0.14385528564453126,
+      "step": 4090
+    },
+    {
+      "epoch": 0.7450873362445415,
+      "grad_norm": 0.16514739394187927,
+      "learning_rate": 8.090205216236135e-06,
+      "loss": 0.14920626878738402,
+      "step": 4095
+    },
+    {
+      "epoch": 0.7459970887918487,
+      "grad_norm": 0.16453702747821808,
+      "learning_rate": 8.03606315000797e-06,
+      "loss": 0.14704222679138185,
+      "step": 4100
+    },
+    {
+      "epoch": 0.7469068413391557,
+      "grad_norm": 0.16719917953014374,
+      "learning_rate": 7.982068144686707e-06,
+      "loss": 0.14722511768341065,
+      "step": 4105
+    },
+    {
+      "epoch": 0.7478165938864629,
+      "grad_norm": 0.18499110639095306,
+      "learning_rate": 7.92822066835677e-06,
+      "loss": 0.1401848554611206,
+      "step": 4110
+    },
+    {
+      "epoch": 0.74872634643377,
+      "grad_norm": 0.17249563336372375,
+      "learning_rate": 7.87452118782363e-06,
+      "loss": 0.15132423639297485,
+      "step": 4115
+    },
+    {
+      "epoch": 0.7496360989810772,
+      "grad_norm": 0.15049682557582855,
+      "learning_rate": 7.8209701686098e-06,
+      "loss": 0.1341150164604187,
+      "step": 4120
+    },
+    {
+      "epoch": 0.7505458515283843,
+      "grad_norm": 0.16892646253108978,
+      "learning_rate": 7.767568074950751e-06,
+      "loss": 0.1466840147972107,
+      "step": 4125
+    },
+    {
+      "epoch": 0.7514556040756915,
+      "grad_norm": 0.17288286983966827,
+      "learning_rate": 7.714315369790942e-06,
+      "loss": 0.13819680213928223,
+      "step": 4130
+    },
+    {
+      "epoch": 0.7523653566229985,
+      "grad_norm": 0.21893996000289917,
+      "learning_rate": 7.661212514779745e-06,
+      "loss": 0.14369510412216185,
+      "step": 4135
+    },
+    {
+      "epoch": 0.7532751091703057,
+      "grad_norm": 0.1674601435661316,
+      "learning_rate": 7.608259970267509e-06,
+      "loss": 0.14810250997543334,
+      "step": 4140
+    },
+    {
+      "epoch": 0.7541848617176128,
+      "grad_norm": 0.15875539183616638,
+      "learning_rate": 7.555458195301526e-06,
+      "loss": 0.14103198051452637,
+      "step": 4145
+    },
+    {
+      "epoch": 0.75509461426492,
+      "grad_norm": 0.19454079866409302,
+      "learning_rate": 7.502807647622037e-06,
+      "loss": 0.13848764896392823,
+      "step": 4150
+    },
+    {
+      "epoch": 0.756004366812227,
+      "grad_norm": 0.1795455813407898,
+      "learning_rate": 7.450308783658341e-06,
+      "loss": 0.14459335803985596,
+      "step": 4155
+    },
+    {
+      "epoch": 0.7569141193595342,
+      "grad_norm": 0.1643362045288086,
+      "learning_rate": 7.397962058524735e-06,
+      "loss": 0.14335378408432006,
+      "step": 4160
+    },
+    {
+      "epoch": 0.7578238719068413,
+      "grad_norm": 0.16362066566944122,
+      "learning_rate": 7.3457679260166475e-06,
+      "loss": 0.14222005605697632,
+      "step": 4165
+    },
+    {
+      "epoch": 0.7587336244541485,
+      "grad_norm": 0.17313003540039062,
+      "learning_rate": 7.293726838606674e-06,
+      "loss": 0.14272255897521974,
+      "step": 4170
+    },
+    {
+      "epoch": 0.7596433770014556,
+      "grad_norm": 0.1809929460287094,
+      "learning_rate": 7.2418392474406405e-06,
+      "loss": 0.14089123010635377,
+      "step": 4175
+    },
+    {
+      "epoch": 0.7605531295487628,
+      "grad_norm": 0.14306005835533142,
+      "learning_rate": 7.19010560233373e-06,
+      "loss": 0.13531534671783446,
+      "step": 4180
+    },
+    {
+      "epoch": 0.7614628820960698,
+      "grad_norm": 0.15525390207767487,
+      "learning_rate": 7.138526351766559e-06,
+      "loss": 0.14340845346450806,
+      "step": 4185
+    },
+    {
+      "epoch": 0.762372634643377,
+      "grad_norm": 0.24478943645954132,
+      "learning_rate": 7.087101942881263e-06,
+      "loss": 0.14744555950164795,
+      "step": 4190
+    },
+    {
+      "epoch": 0.7632823871906841,
+      "grad_norm": 0.31335577368736267,
+      "learning_rate": 7.035832821477711e-06,
+      "loss": 0.1484094500541687,
+      "step": 4195
+    },
+    {
+      "epoch": 0.7641921397379913,
+      "grad_norm": 0.15140366554260254,
+      "learning_rate": 6.984719432009515e-06,
+      "loss": 0.14991614818572999,
+      "step": 4200
+    },
+    {
+      "epoch": 0.7651018922852983,
+      "grad_norm": 0.16125506162643433,
+      "learning_rate": 6.933762217580289e-06,
+      "loss": 0.1408134937286377,
+      "step": 4205
+    },
+    {
+      "epoch": 0.7660116448326055,
+      "grad_norm": 0.2501450181007385,
+      "learning_rate": 6.882961619939726e-06,
+      "loss": 0.13875640630722047,
+      "step": 4210
+    },
+    {
+      "epoch": 0.7669213973799127,
+      "grad_norm": 0.16227811574935913,
+      "learning_rate": 6.8323180794798245e-06,
+      "loss": 0.14138660430908204,
+      "step": 4215
+    },
+    {
+      "epoch": 0.7678311499272198,
+      "grad_norm": 0.16676810383796692,
+      "learning_rate": 6.781832035231053e-06,
+      "loss": 0.14696706533432008,
+      "step": 4220
+    },
+    {
+      "epoch": 0.768740902474527,
+      "grad_norm": 0.14638574421405792,
+      "learning_rate": 6.731503924858518e-06,
+      "loss": 0.14263020753860473,
+      "step": 4225
+    },
+    {
+      "epoch": 0.769650655021834,
+      "grad_norm": 0.17093190550804138,
+      "learning_rate": 6.681334184658211e-06,
+      "loss": 0.14694111347198485,
+      "step": 4230
+    },
+    {
+      "epoch": 0.7705604075691412,
+      "grad_norm": 0.17174287140369415,
+      "learning_rate": 6.631323249553201e-06,
+      "loss": 0.13854929208755493,
+      "step": 4235
+    },
+    {
+      "epoch": 0.7714701601164483,
+      "grad_norm": 0.14599016308784485,
+      "learning_rate": 6.5814715530898745e-06,
+      "loss": 0.14058833122253417,
+      "step": 4240
+    },
+    {
+      "epoch": 0.7723799126637555,
+      "grad_norm": 0.16222265362739563,
+      "learning_rate": 6.531779527434176e-06,
+      "loss": 0.1428326725959778,
+      "step": 4245
+    },
+    {
+      "epoch": 0.7732896652110626,
+      "grad_norm": 0.1741994023323059,
+      "learning_rate": 6.482247603367839e-06,
+      "loss": 0.13985042572021483,
+      "step": 4250
+    },
+    {
+      "epoch": 0.7741994177583698,
+      "grad_norm": 0.17427101731300354,
+      "learning_rate": 6.432876210284688e-06,
+      "loss": 0.1442667603492737,
+      "step": 4255
+    },
+    {
+      "epoch": 0.7751091703056768,
+      "grad_norm": 0.1665259599685669,
+      "learning_rate": 6.383665776186912e-06,
+      "loss": 0.1421986222267151,
+      "step": 4260
+    },
+    {
+      "epoch": 0.776018922852984,
+      "grad_norm": 0.1728232353925705,
+      "learning_rate": 6.334616727681303e-06,
+      "loss": 0.1367053508758545,
+      "step": 4265
+    },
+    {
+      "epoch": 0.7769286754002911,
+      "grad_norm": 0.15882381796836853,
+      "learning_rate": 6.285729489975639e-06,
+      "loss": 0.14551182985305786,
+      "step": 4270
+    },
+    {
+      "epoch": 0.7778384279475983,
+      "grad_norm": 0.242042675614357,
+      "learning_rate": 6.2370044868749115e-06,
+      "loss": 0.1455132007598877,
+      "step": 4275
+    },
+    {
+      "epoch": 0.7787481804949054,
+      "grad_norm": 0.1599501073360443,
+      "learning_rate": 6.188442140777742e-06,
+      "loss": 0.1424942970275879,
+      "step": 4280
+    },
+    {
+      "epoch": 0.7796579330422125,
+      "grad_norm": 0.15182635188102722,
+      "learning_rate": 6.140042872672647e-06,
+      "loss": 0.14212887287139891,
+      "step": 4285
+    },
+    {
+      "epoch": 0.7805676855895196,
+      "grad_norm": 0.1720375418663025,
+      "learning_rate": 6.091807102134403e-06,
+      "loss": 0.14243412017822266,
+      "step": 4290
+    },
+    {
+      "epoch": 0.7814774381368268,
+      "grad_norm": 0.16436047852039337,
+      "learning_rate": 6.043735247320454e-06,
+      "loss": 0.15035657882690429,
+      "step": 4295
+    },
+    {
+      "epoch": 0.7823871906841339,
+      "grad_norm": 0.1498408019542694,
+      "learning_rate": 5.995827724967218e-06,
+      "loss": 0.14494839906692505,
+      "step": 4300
+    },
+    {
+      "epoch": 0.7832969432314411,
+      "grad_norm": 0.16924560070037842,
+      "learning_rate": 5.948084950386535e-06,
+      "loss": 0.13581212759017944,
+      "step": 4305
+    },
+    {
+      "epoch": 0.7842066957787481,
+      "grad_norm": 0.15889139473438263,
+      "learning_rate": 5.900507337462036e-06,
+      "loss": 0.15071530342102052,
+      "step": 4310
+    },
+    {
+      "epoch": 0.7851164483260553,
+      "grad_norm": 0.17201054096221924,
+      "learning_rate": 5.853095298645542e-06,
+      "loss": 0.1398628830909729,
+      "step": 4315
+    },
+    {
+      "epoch": 0.7860262008733624,
+      "grad_norm": 0.17965619266033173,
+      "learning_rate": 5.805849244953548e-06,
+      "loss": 0.14666696786880493,
+      "step": 4320
+    },
+    {
+      "epoch": 0.7869359534206696,
+      "grad_norm": 0.17514032125473022,
+      "learning_rate": 5.758769585963569e-06,
+      "loss": 0.1383386731147766,
+      "step": 4325
+    },
+    {
+      "epoch": 0.7878457059679768,
+      "grad_norm": 0.17497631907463074,
+      "learning_rate": 5.7118567298106744e-06,
+      "loss": 0.14362354278564454,
+      "step": 4330
+    },
+    {
+      "epoch": 0.7887554585152838,
+      "grad_norm": 0.16770458221435547,
+      "learning_rate": 5.665111083183905e-06,
+      "loss": 0.14136618375778198,
+      "step": 4335
+    },
+    {
+      "epoch": 0.789665211062591,
+      "grad_norm": 0.17134106159210205,
+      "learning_rate": 5.618533051322747e-06,
+      "loss": 0.1401529550552368,
+      "step": 4340
+    },
+    {
+      "epoch": 0.7905749636098981,
+      "grad_norm": 0.19458788633346558,
+      "learning_rate": 5.5721230380136435e-06,
+      "loss": 0.1393273115158081,
+      "step": 4345
+    },
+    {
+      "epoch": 0.7914847161572053,
+      "grad_norm": 0.19483692944049835,
+      "learning_rate": 5.525881445586467e-06,
+      "loss": 0.1369825482368469,
+      "step": 4350
+    },
+    {
+      "epoch": 0.7923944687045124,
+      "grad_norm": 0.3052191734313965,
+      "learning_rate": 5.4798086749110495e-06,
+      "loss": 0.14762181043624878,
+      "step": 4355
+    },
+    {
+      "epoch": 0.7933042212518195,
+      "grad_norm": 0.164458766579628,
+      "learning_rate": 5.4339051253937065e-06,
+      "loss": 0.14501686096191407,
+      "step": 4360
+    },
+    {
+      "epoch": 0.7942139737991266,
+      "grad_norm": 0.1719193458557129,
+      "learning_rate": 5.3881711949737625e-06,
+      "loss": 0.13321092128753662,
+      "step": 4365
+    },
+    {
+      "epoch": 0.7951237263464338,
+      "grad_norm": 0.17219696938991547,
+      "learning_rate": 5.342607280120121e-06,
+      "loss": 0.1413906455039978,
+      "step": 4370
+    },
+    {
+      "epoch": 0.7960334788937409,
+      "grad_norm": 0.15083056688308716,
+      "learning_rate": 5.297213775827789e-06,
+      "loss": 0.14772192239761353,
+      "step": 4375
+    },
+    {
+      "epoch": 0.7969432314410481,
+      "grad_norm": 0.1699071079492569,
+      "learning_rate": 5.251991075614507e-06,
+      "loss": 0.1392375946044922,
+      "step": 4380
+    },
+    {
+      "epoch": 0.7978529839883551,
+      "grad_norm": 0.1680395007133484,
+      "learning_rate": 5.206939571517302e-06,
+      "loss": 0.14185575246810914,
+      "step": 4385
+    },
+    {
+      "epoch": 0.7987627365356623,
+      "grad_norm": 0.16526710987091064,
+      "learning_rate": 5.162059654089083e-06,
+      "loss": 0.15001428127288818,
+      "step": 4390
+    },
+    {
+      "epoch": 0.7996724890829694,
+      "grad_norm": 0.16281752288341522,
+      "learning_rate": 5.1173517123952794e-06,
+      "loss": 0.13747023344039916,
+      "step": 4395
+    },
+    {
+      "epoch": 0.8005822416302766,
+      "grad_norm": 0.1454378366470337,
+      "learning_rate": 5.072816134010458e-06,
+      "loss": 0.14710829257965088,
+      "step": 4400
+    },
+    {
+      "epoch": 0.8014919941775837,
+      "grad_norm": 0.16565890610218048,
+      "learning_rate": 5.028453305014966e-06,
+      "loss": 0.14138611555099487,
+      "step": 4405
+    },
+    {
+      "epoch": 0.8024017467248908,
+      "grad_norm": 0.1962810605764389,
+      "learning_rate": 4.984263609991577e-06,
+      "loss": 0.13836177587509155,
+      "step": 4410
+    },
+    {
+      "epoch": 0.8033114992721979,
+      "grad_norm": 0.16091369092464447,
+      "learning_rate": 4.940247432022149e-06,
+      "loss": 0.14407440423965454,
+      "step": 4415
+    },
+    {
+      "epoch": 0.8042212518195051,
+      "grad_norm": 0.1930241584777832,
+      "learning_rate": 4.89640515268433e-06,
+      "loss": 0.14346336126327514,
+      "step": 4420
+    },
+    {
+      "epoch": 0.8051310043668122,
+      "grad_norm": 0.19301500916481018,
+      "learning_rate": 4.852737152048242e-06,
+      "loss": 0.14174317121505736,
+      "step": 4425
+    },
+    {
+      "epoch": 0.8060407569141194,
+      "grad_norm": 0.1541353315114975,
+      "learning_rate": 4.80924380867315e-06,
+      "loss": 0.14100592136383056,
+      "step": 4430
+    },
+    {
+      "epoch": 0.8069505094614265,
+      "grad_norm": 0.16285750269889832,
+      "learning_rate": 4.765925499604243e-06,
+      "loss": 0.1441288709640503,
+      "step": 4435
+    },
+    {
+      "epoch": 0.8078602620087336,
+      "grad_norm": 0.17382675409317017,
+      "learning_rate": 4.722782600369299e-06,
+      "loss": 0.13763951063156127,
+      "step": 4440
+    },
+    {
+      "epoch": 0.8087700145560408,
+      "grad_norm": 0.1697344034910202,
+      "learning_rate": 4.679815484975505e-06,
+      "loss": 0.1410105347633362,
+      "step": 4445
+    },
+    {
+      "epoch": 0.8096797671033479,
+      "grad_norm": 0.19964542984962463,
+      "learning_rate": 4.637024525906131e-06,
+      "loss": 0.1439276695251465,
+      "step": 4450
+    },
+    {
+      "epoch": 0.8105895196506551,
+      "grad_norm": 0.165307879447937,
+      "learning_rate": 4.59441009411736e-06,
+      "loss": 0.13897504806518554,
+      "step": 4455
+    },
+    {
+      "epoch": 0.8114992721979621,
+      "grad_norm": 0.16687989234924316,
+      "learning_rate": 4.551972559035067e-06,
+      "loss": 0.1422593355178833,
+      "step": 4460
+    },
+    {
+      "epoch": 0.8124090247452693,
+      "grad_norm": 0.15737789869308472,
+      "learning_rate": 4.509712288551571e-06,
+      "loss": 0.1452128052711487,
+      "step": 4465
+    },
+    {
+      "epoch": 0.8133187772925764,
+      "grad_norm": 0.17116659879684448,
+      "learning_rate": 4.467629649022509e-06,
+      "loss": 0.14385371208190917,
+      "step": 4470
+    },
+    {
+      "epoch": 0.8142285298398836,
+      "grad_norm": 0.17457640171051025,
+      "learning_rate": 4.425725005263623e-06,
+      "loss": 0.14808475971221924,
+      "step": 4475
+    },
+    {
+      "epoch": 0.8151382823871907,
+      "grad_norm": 0.1621970385313034,
+      "learning_rate": 4.383998720547583e-06,
+      "loss": 0.13927959203720092,
+      "step": 4480
+    },
+    {
+      "epoch": 0.8160480349344978,
+      "grad_norm": 0.176296666264534,
+      "learning_rate": 4.342451156600896e-06,
+      "loss": 0.15041060447692872,
+      "step": 4485
+    },
+    {
+      "epoch": 0.8169577874818049,
+      "grad_norm": 0.17157645523548126,
+      "learning_rate": 4.301082673600698e-06,
+      "loss": 0.13932652473449708,
+      "step": 4490
+    },
+    {
+      "epoch": 0.8178675400291121,
+      "grad_norm": 0.15378527343273163,
+      "learning_rate": 4.259893630171682e-06,
+      "loss": 0.1406856894493103,
+      "step": 4495
+    },
+    {
+      "epoch": 0.8187772925764192,
+      "grad_norm": 0.1750226765871048,
+      "learning_rate": 4.218884383382987e-06,
+      "loss": 0.1350164532661438,
+      "step": 4500
+    },
+    {
+      "epoch": 0.8196870451237264,
+      "grad_norm": 0.1393742561340332,
+      "learning_rate": 4.178055288745053e-06,
+      "loss": 0.13769235610961914,
+      "step": 4505
+    },
+    {
+      "epoch": 0.8205967976710334,
+      "grad_norm": 0.1668994128704071,
+      "learning_rate": 4.137406700206617e-06,
+      "loss": 0.14029752016067504,
+      "step": 4510
+    },
+    {
+      "epoch": 0.8215065502183406,
+      "grad_norm": 0.1833454668521881,
+      "learning_rate": 4.0969389701515675e-06,
+      "loss": 0.14276301860809326,
+      "step": 4515
+    },
+    {
+      "epoch": 0.8224163027656477,
+      "grad_norm": 0.16187874972820282,
+      "learning_rate": 4.056652449395945e-06,
+      "loss": 0.1444832682609558,
+      "step": 4520
+    },
+    {
+      "epoch": 0.8233260553129549,
+      "grad_norm": 0.1453280746936798,
+      "learning_rate": 4.01654748718488e-06,
+      "loss": 0.14512733221054078,
+      "step": 4525
+    },
+    {
+      "epoch": 0.824235807860262,
+      "grad_norm": 0.1782725751399994,
+      "learning_rate": 3.976624431189563e-06,
+      "loss": 0.14093561172485353,
+      "step": 4530
+    },
+    {
+      "epoch": 0.8251455604075691,
+      "grad_norm": 0.17374491691589355,
+      "learning_rate": 3.936883627504234e-06,
+      "loss": 0.14031401872634888,
+      "step": 4535
+    },
+    {
+      "epoch": 0.8260553129548762,
+      "grad_norm": 0.1609172821044922,
+      "learning_rate": 3.897325420643174e-06,
+      "loss": 0.1428336262702942,
+      "step": 4540
+    },
+    {
+      "epoch": 0.8269650655021834,
+      "grad_norm": 0.1520884931087494,
+      "learning_rate": 3.85795015353774e-06,
+      "loss": 0.1460547924041748,
+      "step": 4545
+    },
+    {
+      "epoch": 0.8278748180494906,
+      "grad_norm": 0.20986326038837433,
+      "learning_rate": 3.818758167533376e-06,
+      "loss": 0.14706350564956666,
+      "step": 4550
+    },
+    {
+      "epoch": 0.8287845705967977,
+      "grad_norm": 0.16825413703918457,
+      "learning_rate": 3.7797498023866396e-06,
+      "loss": 0.14507200717926025,
+      "step": 4555
+    },
+    {
+      "epoch": 0.8296943231441049,
+      "grad_norm": 0.16758380830287933,
+      "learning_rate": 3.740925396262296e-06,
+      "loss": 0.14898381233215333,
+      "step": 4560
+    },
+    {
+      "epoch": 0.8306040756914119,
+      "grad_norm": 0.15207453072071075,
+      "learning_rate": 3.7022852857303503e-06,
+      "loss": 0.14138854742050172,
+      "step": 4565
+    },
+    {
+      "epoch": 0.8315138282387191,
+      "grad_norm": 0.15150749683380127,
+      "learning_rate": 3.66382980576315e-06,
+      "loss": 0.13894975185394287,
+      "step": 4570
+    },
+    {
+      "epoch": 0.8324235807860262,
+      "grad_norm": 0.17071188986301422,
+      "learning_rate": 3.625559289732472e-06,
+      "loss": 0.14072470664978026,
+      "step": 4575
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.154335618019104,
+      "learning_rate": 3.5874740694066294e-06,
+      "loss": 0.13791344165802003,
+      "step": 4580
+    },
+    {
+      "epoch": 0.8342430858806404,
+      "grad_norm": 0.14017128944396973,
+      "learning_rate": 3.5495744749476116e-06,
+      "loss": 0.14427922964096068,
+      "step": 4585
+    },
+    {
+      "epoch": 0.8351528384279476,
+      "grad_norm": 0.17210033535957336,
+      "learning_rate": 3.5118608349081983e-06,
+      "loss": 0.15191166400909423,
+      "step": 4590
+    },
+    {
+      "epoch": 0.8360625909752547,
+      "grad_norm": 0.18715685606002808,
+      "learning_rate": 3.4743334762291358e-06,
+      "loss": 0.14451316595077515,
+      "step": 4595
+    },
+    {
+      "epoch": 0.8369723435225619,
+      "grad_norm": 0.18079884350299835,
+      "learning_rate": 3.436992724236293e-06,
+      "loss": 0.13530746698379517,
+      "step": 4600
+    },
+    {
+      "epoch": 0.837882096069869,
+      "grad_norm": 0.13519920408725739,
+      "learning_rate": 3.399838902637817e-06,
+      "loss": 0.1477964401245117,
+      "step": 4605
+    },
+    {
+      "epoch": 0.8387918486171762,
+      "grad_norm": 0.1778026670217514,
+      "learning_rate": 3.3628723335213885e-06,
+      "loss": 0.14419831037521363,
+      "step": 4610
+    },
+    {
+      "epoch": 0.8397016011644832,
+      "grad_norm": 0.15165366232395172,
+      "learning_rate": 3.326093337351355e-06,
+      "loss": 0.13888469934463502,
+      "step": 4615
+    },
+    {
+      "epoch": 0.8406113537117904,
+      "grad_norm": 0.17049473524093628,
+      "learning_rate": 3.2895022329660018e-06,
+      "loss": 0.14438477754592896,
+      "step": 4620
+    },
+    {
+      "epoch": 0.8415211062590975,
+      "grad_norm": 0.16536414623260498,
+      "learning_rate": 3.2530993375747833e-06,
+      "loss": 0.1444351315498352,
+      "step": 4625
+    },
+    {
+      "epoch": 0.8424308588064047,
+      "grad_norm": 0.17570015788078308,
+      "learning_rate": 3.2168849667555402e-06,
+      "loss": 0.13861945867538453,
+      "step": 4630
+    },
+    {
+      "epoch": 0.8433406113537117,
+      "grad_norm": 0.1699545532464981,
+      "learning_rate": 3.1808594344518132e-06,
+      "loss": 0.13902754783630372,
+      "step": 4635
+    },
+    {
+      "epoch": 0.8442503639010189,
+      "grad_norm": 0.12331254780292511,
+      "learning_rate": 3.1450230529700837e-06,
+      "loss": 0.14104254245758058,
+      "step": 4640
+    },
+    {
+      "epoch": 0.845160116448326,
+      "grad_norm": 0.1508190929889679,
+      "learning_rate": 3.1093761329770708e-06,
+      "loss": 0.13288766145706177,
+      "step": 4645
+    },
+    {
+      "epoch": 0.8460698689956332,
+      "grad_norm": 0.19049489498138428,
+      "learning_rate": 3.0739189834970735e-06,
+      "loss": 0.14914840459823608,
+      "step": 4650
+    },
+    {
+      "epoch": 0.8469796215429404,
+      "grad_norm": 0.1662369966506958,
+      "learning_rate": 3.0386519119092293e-06,
+      "loss": 0.14222898483276367,
+      "step": 4655
+    },
+    {
+      "epoch": 0.8478893740902474,
+      "grad_norm": 0.18985967338085175,
+      "learning_rate": 3.0035752239449126e-06,
+      "loss": 0.14431113004684448,
+      "step": 4660
+    },
+    {
+      "epoch": 0.8487991266375546,
+      "grad_norm": 0.17005261778831482,
+      "learning_rate": 2.9686892236850337e-06,
+      "loss": 0.14140807390213012,
+      "step": 4665
+    },
+    {
+      "epoch": 0.8497088791848617,
+      "grad_norm": 0.16786684095859528,
+      "learning_rate": 2.9339942135574394e-06,
+      "loss": 0.14161460399627684,
+      "step": 4670
+    },
+    {
+      "epoch": 0.8506186317321689,
+      "grad_norm": 0.16358181834220886,
+      "learning_rate": 2.899490494334281e-06,
+      "loss": 0.14674670696258546,
+      "step": 4675
+    },
+    {
+      "epoch": 0.851528384279476,
+      "grad_norm": 0.1651349812746048,
+      "learning_rate": 2.8651783651293867e-06,
+      "loss": 0.13794611692428588,
+      "step": 4680
+    },
+    {
+      "epoch": 0.8524381368267832,
+      "grad_norm": 0.16934923827648163,
+      "learning_rate": 2.831058123395694e-06,
+      "loss": 0.13199397325515747,
+      "step": 4685
+    },
+    {
+      "epoch": 0.8533478893740902,
+      "grad_norm": 0.1704150140285492,
+      "learning_rate": 2.797130064922665e-06,
+      "loss": 0.14044904708862305,
+      "step": 4690
+    },
+    {
+      "epoch": 0.8542576419213974,
+      "grad_norm": 0.1814192682504654,
+      "learning_rate": 2.7633944838337143e-06,
+      "loss": 0.1465100646018982,
+      "step": 4695
+    },
+    {
+      "epoch": 0.8551673944687045,
+      "grad_norm": 0.18942610919475555,
+      "learning_rate": 2.729851672583669e-06,
+      "loss": 0.14685982465744019,
+      "step": 4700
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.584293375636099e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-4700/training_args.bin b/checkpoint-4700/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-4700/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-4800/README.md b/checkpoint-4800/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-4800/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-4800/adapter_config.json b/checkpoint-4800/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-4800/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-4800/adapter_model.safetensors b/checkpoint-4800/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3d0a1ba8062224b35070478c18c1ae754c7b13a4
--- /dev/null
+++ b/checkpoint-4800/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67b1d36e77867d8477a11fe1aaf8a2840d592a443782cc064829db4dfafb6a87
+size 169741912
diff --git a/checkpoint-4800/chat_template.jinja b/checkpoint-4800/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-4800/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-4800/optimizer.pt b/checkpoint-4800/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ce496aea57076dacb2e9da05d33f3c53ad7509d6
--- /dev/null
+++ b/checkpoint-4800/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a45cc82d4224e600cf448c802460c6ce98419f7fc4a1ea80c6bd6835f378e34
+size 72807355
diff --git a/checkpoint-4800/processor_config.json b/checkpoint-4800/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-4800/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-4800/rng_state.pth b/checkpoint-4800/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-4800/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-4800/scheduler.pt b/checkpoint-4800/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..1ec01cd25f540fcd1967eba137981512bb4614a3
--- /dev/null
+++ b/checkpoint-4800/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aedf70d9ea7ff9ec0e7c368bea8350cbcd293e65545950562608f0d2a10a4e0c
+size 1465
diff --git a/checkpoint-4800/tokenizer.json b/checkpoint-4800/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-4800/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-4800/tokenizer_config.json b/checkpoint-4800/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-4800/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-4800/trainer_state.json b/checkpoint-4800/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..68a3ec5e69df42f786cd9c4fb293a5d9cf426b0a
--- /dev/null
+++ b/checkpoint-4800/trainer_state.json
@@ -0,0 +1,6762 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.8733624454148472,
+  "eval_steps": 100,
+  "global_step": 4800,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6377365356622998,
+      "grad_norm": 0.17014239728450775,
+      "learning_rate": 1.5370045130657366e-05,
+      "loss": 0.14694437980651856,
+      "step": 3505
+    },
+    {
+      "epoch": 0.638646288209607,
+      "grad_norm": 0.14744038879871368,
+      "learning_rate": 1.5302158945041838e-05,
+      "loss": 0.14434736967086792,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6395560407569141,
+      "grad_norm": 0.2069770246744156,
+      "learning_rate": 1.523435683031818e-05,
+      "loss": 0.13982917070388795,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6404657933042213,
+      "grad_norm": 0.17811502516269684,
+      "learning_rate": 1.5166639374265063e-05,
+      "loss": 0.1408839702606201,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6413755458515283,
+      "grad_norm": 0.165786474943161,
+      "learning_rate": 1.509900716392728e-05,
+      "loss": 0.15312877893447877,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6422852983988355,
+      "grad_norm": 0.1633884161710739,
+      "learning_rate": 1.5031460785610596e-05,
+      "loss": 0.1488795518875122,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6431950509461426,
+      "grad_norm": 0.16498984396457672,
+      "learning_rate": 1.4964000824876723e-05,
+      "loss": 0.15031465291976928,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6441048034934498,
+      "grad_norm": 0.18043678998947144,
+      "learning_rate": 1.4896627866538191e-05,
+      "loss": 0.147829806804657,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6450145560407569,
+      "grad_norm": 0.16813597083091736,
+      "learning_rate": 1.4829342494653315e-05,
+      "loss": 0.1418998956680298,
+      "step": 3545
+    },
+    {
+      "epoch": 0.645924308588064,
+      "grad_norm": 0.1817242056131363,
+      "learning_rate": 1.4762145292521118e-05,
+      "loss": 0.14508869647979736,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6468340611353712,
+      "grad_norm": 0.14666494727134705,
+      "learning_rate": 1.469503684267628e-05,
+      "loss": 0.14159854650497436,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6477438136826783,
+      "grad_norm": 0.16485381126403809,
+      "learning_rate": 1.4628017726884086e-05,
+      "loss": 0.14419105052947997,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6486535662299855,
+      "grad_norm": 0.16100342571735382,
+      "learning_rate": 1.4561088526135375e-05,
+      "loss": 0.14501721858978273,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6495633187772926,
+      "grad_norm": 0.16996590793132782,
+      "learning_rate": 1.4494249820641493e-05,
+      "loss": 0.1377166509628296,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6504730713245997,
+      "grad_norm": 0.16168837249279022,
+      "learning_rate": 1.4427502189829339e-05,
+      "loss": 0.1414325475692749,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6513828238719068,
+      "grad_norm": 0.16318906843662262,
+      "learning_rate": 1.436084621233621e-05,
+      "loss": 0.14685193300247193,
+      "step": 3580
+    },
+    {
+      "epoch": 0.652292576419214,
+      "grad_norm": 0.1636219322681427,
+      "learning_rate": 1.4294282466004899e-05,
+      "loss": 0.1405899167060852,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6532023289665211,
+      "grad_norm": 0.1838461309671402,
+      "learning_rate": 1.422781152787865e-05,
+      "loss": 0.14386332035064697,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6541120815138283,
+      "grad_norm": 0.1796344667673111,
+      "learning_rate": 1.4161433974196115e-05,
+      "loss": 0.1513024687767029,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6550218340611353,
+      "grad_norm": 0.16424529254436493,
+      "learning_rate": 1.4095150380386427e-05,
+      "loss": 0.14238927364349366,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6559315866084425,
+      "grad_norm": 0.19264160096645355,
+      "learning_rate": 1.402896132106415e-05,
+      "loss": 0.14297477006912232,
+      "step": 3605
+    },
+    {
+      "epoch": 0.6568413391557496,
+      "grad_norm": 0.18319948017597198,
+      "learning_rate": 1.3962867370024347e-05,
+      "loss": 0.1448880434036255,
+      "step": 3610
+    },
+    {
+      "epoch": 0.6577510917030568,
+      "grad_norm": 0.16507290303707123,
+      "learning_rate": 1.389686910023758e-05,
+      "loss": 0.14724698066711425,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6586608442503639,
+      "grad_norm": 0.17871244251728058,
+      "learning_rate": 1.3830967083844942e-05,
+      "loss": 0.14479386806488037,
+      "step": 3620
+    },
+    {
+      "epoch": 0.659570596797671,
+      "grad_norm": 0.1846228390932083,
+      "learning_rate": 1.3765161892153112e-05,
+      "loss": 0.1453616738319397,
+      "step": 3625
+    },
+    {
+      "epoch": 0.6604803493449781,
+      "grad_norm": 0.17185978591442108,
+      "learning_rate": 1.3699454095629372e-05,
+      "loss": 0.14906206130981445,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6613901018922853,
+      "grad_norm": 0.14751191437244415,
+      "learning_rate": 1.3633844263896698e-05,
+      "loss": 0.13991892337799072,
+      "step": 3635
+    },
+    {
+      "epoch": 0.6622998544395924,
+      "grad_norm": 0.22059763967990875,
+      "learning_rate": 1.3568332965728817e-05,
+      "loss": 0.14680869579315187,
+      "step": 3640
+    },
+    {
+      "epoch": 0.6632096069868996,
+      "grad_norm": 0.15295909345149994,
+      "learning_rate": 1.3502920769045232e-05,
+      "loss": 0.1404443383216858,
+      "step": 3645
+    },
+    {
+      "epoch": 0.6641193595342066,
+      "grad_norm": 0.14600558578968048,
+      "learning_rate": 1.3437608240906364e-05,
+      "loss": 0.14663270711898804,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6650291120815138,
+      "grad_norm": 0.15548352897167206,
+      "learning_rate": 1.3372395947508587e-05,
+      "loss": 0.1431443452835083,
+      "step": 3655
+    },
+    {
+      "epoch": 0.665938864628821,
+      "grad_norm": 0.1813388466835022,
+      "learning_rate": 1.3307284454179342e-05,
+      "loss": 0.1458706736564636,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6668486171761281,
+      "grad_norm": 0.16326870024204254,
+      "learning_rate": 1.3242274325372247e-05,
+      "loss": 0.14700595140457154,
+      "step": 3665
+    },
+    {
+      "epoch": 0.6677583697234353,
+      "grad_norm": 0.18779197335243225,
+      "learning_rate": 1.3177366124662149e-05,
+      "loss": 0.1497237801551819,
+      "step": 3670
+    },
+    {
+      "epoch": 0.6686681222707423,
+      "grad_norm": 0.16291002929210663,
+      "learning_rate": 1.3112560414740315e-05,
+      "loss": 0.1387086868286133,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6695778748180495,
+      "grad_norm": 0.1532297134399414,
+      "learning_rate": 1.3047857757409487e-05,
+      "loss": 0.14497545957565308,
+      "step": 3680
+    },
+    {
+      "epoch": 0.6704876273653566,
+      "grad_norm": 0.14697515964508057,
+      "learning_rate": 1.2983258713579066e-05,
+      "loss": 0.1494283437728882,
+      "step": 3685
+    },
+    {
+      "epoch": 0.6713973799126638,
+      "grad_norm": 0.15213452279567719,
+      "learning_rate": 1.2918763843260218e-05,
+      "loss": 0.1468907594680786,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6723071324599709,
+      "grad_norm": 0.1745215803384781,
+      "learning_rate": 1.285437370556099e-05,
+      "loss": 0.14997754096984864,
+      "step": 3695
+    },
+    {
+      "epoch": 0.673216885007278,
+      "grad_norm": 0.19207637012004852,
+      "learning_rate": 1.2790088858681577e-05,
+      "loss": 0.14202862977981567,
+      "step": 3700
+    },
+    {
+      "epoch": 0.6741266375545851,
+      "grad_norm": 0.1521359086036682,
+      "learning_rate": 1.2725909859909313e-05,
+      "loss": 0.14547673463821412,
+      "step": 3705
+    },
+    {
+      "epoch": 0.6750363901018923,
+      "grad_norm": 0.16975535452365875,
+      "learning_rate": 1.2661837265613999e-05,
+      "loss": 0.14006874561309815,
+      "step": 3710
+    },
+    {
+      "epoch": 0.6759461426491994,
+      "grad_norm": 0.22234582901000977,
+      "learning_rate": 1.2597871631242992e-05,
+      "loss": 0.13691173791885375,
+      "step": 3715
+    },
+    {
+      "epoch": 0.6768558951965066,
+      "grad_norm": 0.16082969307899475,
+      "learning_rate": 1.2534013511316383e-05,
+      "loss": 0.14932308197021485,
+      "step": 3720
+    },
+    {
+      "epoch": 0.6777656477438136,
+      "grad_norm": 0.1751091182231903,
+      "learning_rate": 1.247026345942226e-05,
+      "loss": 0.14531974792480468,
+      "step": 3725
+    },
+    {
+      "epoch": 0.6786754002911208,
+      "grad_norm": 0.15838147699832916,
+      "learning_rate": 1.2406622028211844e-05,
+      "loss": 0.14759832620620728,
+      "step": 3730
+    },
+    {
+      "epoch": 0.6795851528384279,
+      "grad_norm": 0.1771744042634964,
+      "learning_rate": 1.2343089769394714e-05,
+      "loss": 0.1382831573486328,
+      "step": 3735
+    },
+    {
+      "epoch": 0.6804949053857351,
+      "grad_norm": 0.16301538050174713,
+      "learning_rate": 1.2279667233734037e-05,
+      "loss": 0.14444775581359864,
+      "step": 3740
+    },
+    {
+      "epoch": 0.6814046579330422,
+      "grad_norm": 0.1584121286869049,
+      "learning_rate": 1.2216354971041796e-05,
+      "loss": 0.14200170040130616,
+      "step": 3745
+    },
+    {
+      "epoch": 0.6823144104803494,
+      "grad_norm": 0.139187291264534,
+      "learning_rate": 1.2153153530174007e-05,
+      "loss": 0.14318310022354125,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6832241630276564,
+      "grad_norm": 0.13665248453617096,
+      "learning_rate": 1.2090063459025955e-05,
+      "loss": 0.1411946654319763,
+      "step": 3755
+    },
+    {
+      "epoch": 0.6841339155749636,
+      "grad_norm": 0.16273781657218933,
+      "learning_rate": 1.2027085304527475e-05,
+      "loss": 0.14873508214950562,
+      "step": 3760
+    },
+    {
+      "epoch": 0.6850436681222707,
+      "grad_norm": 0.16317526996135712,
+      "learning_rate": 1.1964219612638194e-05,
+      "loss": 0.14644203186035157,
+      "step": 3765
+    },
+    {
+      "epoch": 0.6859534206695779,
+      "grad_norm": 0.17253617942333221,
+      "learning_rate": 1.1901466928342777e-05,
+      "loss": 0.14027841091156007,
+      "step": 3770
+    },
+    {
+      "epoch": 0.6868631732168851,
+      "grad_norm": 0.19692830741405487,
+      "learning_rate": 1.183882779564624e-05,
+      "loss": 0.14411110877990724,
+      "step": 3775
+    },
+    {
+      "epoch": 0.6877729257641921,
+      "grad_norm": 0.15444578230381012,
+      "learning_rate": 1.1776302757569214e-05,
+      "loss": 0.14355008602142333,
+      "step": 3780
+    },
+    {
+      "epoch": 0.6886826783114993,
+      "grad_norm": 0.1622200757265091,
+      "learning_rate": 1.1713892356143239e-05,
+      "loss": 0.14794334173202514,
+      "step": 3785
+    },
+    {
+      "epoch": 0.6895924308588064,
+      "grad_norm": 0.1898501068353653,
+      "learning_rate": 1.1651597132406073e-05,
+      "loss": 0.1418622612953186,
+      "step": 3790
+    },
+    {
+      "epoch": 0.6905021834061136,
+      "grad_norm": 0.17803208529949188,
+      "learning_rate": 1.1589417626396973e-05,
+      "loss": 0.14576040506362914,
+      "step": 3795
+    },
+    {
+      "epoch": 0.6914119359534207,
+      "grad_norm": 0.17138013243675232,
+      "learning_rate": 1.1527354377152053e-05,
+      "loss": 0.14494270086288452,
+      "step": 3800
+    },
+    {
+      "epoch": 0.6923216885007278,
+      "grad_norm": 0.15170913934707642,
+      "learning_rate": 1.1465407922699603e-05,
+      "loss": 0.144084370136261,
+      "step": 3805
+    },
+    {
+      "epoch": 0.6932314410480349,
+      "grad_norm": 0.158562570810318,
+      "learning_rate": 1.1403578800055387e-05,
+      "loss": 0.13636608123779298,
+      "step": 3810
+    },
+    {
+      "epoch": 0.6941411935953421,
+      "grad_norm": 0.17687302827835083,
+      "learning_rate": 1.1341867545218044e-05,
+      "loss": 0.14214688539505005,
+      "step": 3815
+    },
+    {
+      "epoch": 0.6950509461426492,
+      "grad_norm": 0.15394899249076843,
+      "learning_rate": 1.1280274693164378e-05,
+      "loss": 0.14914129972457885,
+      "step": 3820
+    },
+    {
+      "epoch": 0.6959606986899564,
+      "grad_norm": 0.15709355473518372,
+      "learning_rate": 1.12188007778448e-05,
+      "loss": 0.14798580408096312,
+      "step": 3825
+    },
+    {
+      "epoch": 0.6968704512372634,
+      "grad_norm": 0.16631539165973663,
+      "learning_rate": 1.115744633217864e-05,
+      "loss": 0.14756966829299928,
+      "step": 3830
+    },
+    {
+      "epoch": 0.6977802037845706,
+      "grad_norm": 0.15893076360225677,
+      "learning_rate": 1.109621188804951e-05,
+      "loss": 0.14061959981918334,
+      "step": 3835
+    },
+    {
+      "epoch": 0.6986899563318777,
+      "grad_norm": 0.183414489030838,
+      "learning_rate": 1.103509797630077e-05,
+      "loss": 0.1448473334312439,
+      "step": 3840
+    },
+    {
+      "epoch": 0.6995997088791849,
+      "grad_norm": 0.14087305963039398,
+      "learning_rate": 1.0974105126730841e-05,
+      "loss": 0.14369285106658936,
+      "step": 3845
+    },
+    {
+      "epoch": 0.700509461426492,
+      "grad_norm": 0.16919967532157898,
+      "learning_rate": 1.0913233868088685e-05,
+      "loss": 0.1478085398674011,
+      "step": 3850
+    },
+    {
+      "epoch": 0.7014192139737991,
+      "grad_norm": 0.1439533829689026,
+      "learning_rate": 1.0852484728069178e-05,
+      "loss": 0.14376721382141114,
+      "step": 3855
+    },
+    {
+      "epoch": 0.7023289665211062,
+      "grad_norm": 0.17719274759292603,
+      "learning_rate": 1.0791858233308521e-05,
+      "loss": 0.14089040756225585,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7032387190684134,
+      "grad_norm": 0.19753769040107727,
+      "learning_rate": 1.0731354909379754e-05,
+      "loss": 0.15021742582321168,
+      "step": 3865
+    },
+    {
+      "epoch": 0.7041484716157205,
+      "grad_norm": 0.19186992943286896,
+      "learning_rate": 1.0670975280788086e-05,
+      "loss": 0.14113202095031738,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7050582241630277,
+      "grad_norm": 0.1709229201078415,
+      "learning_rate": 1.0610719870966443e-05,
+      "loss": 0.1500566840171814,
+      "step": 3875
+    },
+    {
+      "epoch": 0.7059679767103348,
+      "grad_norm": 0.17846204340457916,
+      "learning_rate": 1.0550589202270892e-05,
+      "loss": 0.15014195442199707,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7068777292576419,
+      "grad_norm": 0.1827082335948944,
+      "learning_rate": 1.0490583795976091e-05,
+      "loss": 0.1423472762107849,
+      "step": 3885
+    },
+    {
+      "epoch": 0.7077874818049491,
+      "grad_norm": 0.17418377101421356,
+      "learning_rate": 1.043070417227083e-05,
+      "loss": 0.14668900966644288,
+      "step": 3890
+    },
+    {
+      "epoch": 0.7086972343522562,
+      "grad_norm": 0.17385616898536682,
+      "learning_rate": 1.0370950850253449e-05,
+      "loss": 0.14627279043197633,
+      "step": 3895
+    },
+    {
+      "epoch": 0.7096069868995634,
+      "grad_norm": 0.16486723721027374,
+      "learning_rate": 1.0311324347927404e-05,
+      "loss": 0.14603652954101562,
+      "step": 3900
+    },
+    {
+      "epoch": 0.7105167394468704,
+      "grad_norm": 0.21806862950325012,
+      "learning_rate": 1.0251825182196732e-05,
+      "loss": 0.1488169550895691,
+      "step": 3905
+    },
+    {
+      "epoch": 0.7114264919941776,
+      "grad_norm": 0.19884569942951202,
+      "learning_rate": 1.019245386886159e-05,
+      "loss": 0.14387656450271608,
+      "step": 3910
+    },
+    {
+      "epoch": 0.7123362445414847,
+      "grad_norm": 0.16139011085033417,
+      "learning_rate": 1.0133210922613789e-05,
+      "loss": 0.1483074426651001,
+      "step": 3915
+    },
+    {
+      "epoch": 0.7132459970887919,
+      "grad_norm": 0.17000740766525269,
+      "learning_rate": 1.007409685703229e-05,
+      "loss": 0.14050065279006957,
+      "step": 3920
+    },
+    {
+      "epoch": 0.714155749636099,
+      "grad_norm": 0.17235304415225983,
+      "learning_rate": 1.0015112184578813e-05,
+      "loss": 0.1440442681312561,
+      "step": 3925
+    },
+    {
+      "epoch": 0.7150655021834061,
+      "grad_norm": 0.15737567842006683,
+      "learning_rate": 9.956257416593362e-06,
+      "loss": 0.14960765838623047,
+      "step": 3930
+    },
+    {
+      "epoch": 0.7159752547307132,
+      "grad_norm": 0.15499180555343628,
+      "learning_rate": 9.897533063289773e-06,
+      "loss": 0.14488829374313356,
+      "step": 3935
+    },
+    {
+      "epoch": 0.7168850072780204,
+      "grad_norm": 0.17744216322898865,
+      "learning_rate": 9.838939633751337e-06,
+      "loss": 0.1416949987411499,
+      "step": 3940
+    },
+    {
+      "epoch": 0.7177947598253275,
+      "grad_norm": 0.1597192883491516,
+      "learning_rate": 9.780477635926358e-06,
+      "loss": 0.14275280237197877,
+      "step": 3945
+    },
+    {
+      "epoch": 0.7187045123726347,
+      "grad_norm": 0.17800374329090118,
+      "learning_rate": 9.722147576623743e-06,
+      "loss": 0.14532098770141602,
+      "step": 3950
+    },
+    {
+      "epoch": 0.7196142649199417,
+      "grad_norm": 0.1828162521123886,
+      "learning_rate": 9.66394996150864e-06,
+      "loss": 0.14525585174560546,
+      "step": 3955
+    },
+    {
+      "epoch": 0.7205240174672489,
+      "grad_norm": 0.1800539344549179,
+      "learning_rate": 9.605885295098005e-06,
+      "loss": 0.14235819578170777,
+      "step": 3960
+    },
+    {
+      "epoch": 0.721433770014556,
+      "grad_norm": 0.16556483507156372,
+      "learning_rate": 9.54795408075628e-06,
+      "loss": 0.13965482711791993,
+      "step": 3965
+    },
+    {
+      "epoch": 0.7223435225618632,
+      "grad_norm": 0.1592024862766266,
+      "learning_rate": 9.49015682069101e-06,
+      "loss": 0.14051042795181273,
+      "step": 3970
+    },
+    {
+      "epoch": 0.7232532751091703,
+      "grad_norm": 0.18988847732543945,
+      "learning_rate": 9.43249401594846e-06,
+      "loss": 0.1436900496482849,
+      "step": 3975
+    },
+    {
+      "epoch": 0.7241630276564774,
+      "grad_norm": 0.24433808028697968,
+      "learning_rate": 9.374966166409329e-06,
+      "loss": 0.14883997440338134,
+      "step": 3980
+    },
+    {
+      "epoch": 0.7250727802037845,
+      "grad_norm": 0.15091639757156372,
+      "learning_rate": 9.317573770784352e-06,
+      "loss": 0.14726560115814208,
+      "step": 3985
+    },
+    {
+      "epoch": 0.7259825327510917,
+      "grad_norm": 0.17045573890209198,
+      "learning_rate": 9.260317326610051e-06,
+      "loss": 0.14120506048202514,
+      "step": 3990
+    },
+    {
+      "epoch": 0.7268922852983989,
+      "grad_norm": 0.18847957253456116,
+      "learning_rate": 9.203197330244343e-06,
+      "loss": 0.1377041220664978,
+      "step": 3995
+    },
+    {
+      "epoch": 0.727802037845706,
+      "grad_norm": 0.1516445279121399,
+      "learning_rate": 9.14621427686229e-06,
+      "loss": 0.14043946266174318,
+      "step": 4000
+    },
+    {
+      "epoch": 0.7287117903930131,
+      "grad_norm": 0.18264050781726837,
+      "learning_rate": 9.0893686604518e-06,
+      "loss": 0.14080368280410765,
+      "step": 4005
+    },
+    {
+      "epoch": 0.7296215429403202,
+      "grad_norm": 0.19129371643066406,
+      "learning_rate": 9.032660973809312e-06,
+      "loss": 0.1402561902999878,
+      "step": 4010
+    },
+    {
+      "epoch": 0.7305312954876274,
+      "grad_norm": 0.15762710571289062,
+      "learning_rate": 8.976091708535567e-06,
+      "loss": 0.14421157836914061,
+      "step": 4015
+    },
+    {
+      "epoch": 0.7314410480349345,
+      "grad_norm": 0.17785198986530304,
+      "learning_rate": 8.919661355031331e-06,
+      "loss": 0.14999009370803834,
+      "step": 4020
+    },
+    {
+      "epoch": 0.7323508005822417,
+      "grad_norm": 0.15306031703948975,
+      "learning_rate": 8.8633704024931e-06,
+      "loss": 0.14101698398590087,
+      "step": 4025
+    },
+    {
+      "epoch": 0.7332605531295487,
+      "grad_norm": 0.16481758654117584,
+      "learning_rate": 8.807219338908968e-06,
+      "loss": 0.14170764684677123,
+      "step": 4030
+    },
+    {
+      "epoch": 0.7341703056768559,
+      "grad_norm": 0.14892235398292542,
+      "learning_rate": 8.751208651054257e-06,
+      "loss": 0.15317896604537964,
+      "step": 4035
+    },
+    {
+      "epoch": 0.735080058224163,
+      "grad_norm": 0.1775592565536499,
+      "learning_rate": 8.695338824487409e-06,
+      "loss": 0.1520617723464966,
+      "step": 4040
+    },
+    {
+      "epoch": 0.7359898107714702,
+      "grad_norm": 0.1614258885383606,
+      "learning_rate": 8.639610343545728e-06,
+      "loss": 0.13747400045394897,
+      "step": 4045
+    },
+    {
+      "epoch": 0.7368995633187773,
+      "grad_norm": 0.21415506303310394,
+      "learning_rate": 8.58402369134117e-06,
+      "loss": 0.1432439088821411,
+      "step": 4050
+    },
+    {
+      "epoch": 0.7378093158660844,
+      "grad_norm": 0.1759418249130249,
+      "learning_rate": 8.528579349756205e-06,
+      "loss": 0.141641104221344,
+      "step": 4055
+    },
+    {
+      "epoch": 0.7387190684133915,
+      "grad_norm": 0.16738329827785492,
+      "learning_rate": 8.47327779943957e-06,
+      "loss": 0.14294810295104982,
+      "step": 4060
+    },
+    {
+      "epoch": 0.7396288209606987,
+      "grad_norm": 0.13916844129562378,
+      "learning_rate": 8.41811951980217e-06,
+      "loss": 0.13876968622207642,
+      "step": 4065
+    },
+    {
+      "epoch": 0.7405385735080058,
+      "grad_norm": 0.1828441321849823,
+      "learning_rate": 8.36310498901288e-06,
+      "loss": 0.148428475856781,
+      "step": 4070
+    },
+    {
+      "epoch": 0.741448326055313,
+      "grad_norm": 0.16534076631069183,
+      "learning_rate": 8.308234683994415e-06,
+      "loss": 0.14222711324691772,
+      "step": 4075
+    },
+    {
+      "epoch": 0.74235807860262,
+      "grad_norm": 0.17922644317150116,
+      "learning_rate": 8.253509080419198e-06,
+      "loss": 0.14365782737731933,
+      "step": 4080
+    },
+    {
+      "epoch": 0.7432678311499272,
+      "grad_norm": 0.15061035752296448,
+      "learning_rate": 8.198928652705204e-06,
+      "loss": 0.13571925163269044,
+      "step": 4085
+    },
+    {
+      "epoch": 0.7441775836972343,
+      "grad_norm": 0.18075402081012726,
+      "learning_rate": 8.144493874011908e-06,
+      "loss": 0.14385528564453126,
+      "step": 4090
+    },
+    {
+      "epoch": 0.7450873362445415,
+      "grad_norm": 0.16514739394187927,
+      "learning_rate": 8.090205216236135e-06,
+      "loss": 0.14920626878738402,
+      "step": 4095
+    },
+    {
+      "epoch": 0.7459970887918487,
+      "grad_norm": 0.16453702747821808,
+      "learning_rate": 8.03606315000797e-06,
+      "loss": 0.14704222679138185,
+      "step": 4100
+    },
+    {
+      "epoch": 0.7469068413391557,
+      "grad_norm": 0.16719917953014374,
+      "learning_rate": 7.982068144686707e-06,
+      "loss": 0.14722511768341065,
+      "step": 4105
+    },
+    {
+      "epoch": 0.7478165938864629,
+      "grad_norm": 0.18499110639095306,
+      "learning_rate": 7.92822066835677e-06,
+      "loss": 0.1401848554611206,
+      "step": 4110
+    },
+    {
+      "epoch": 0.74872634643377,
+      "grad_norm": 0.17249563336372375,
+      "learning_rate": 7.87452118782363e-06,
+      "loss": 0.15132423639297485,
+      "step": 4115
+    },
+    {
+      "epoch": 0.7496360989810772,
+      "grad_norm": 0.15049682557582855,
+      "learning_rate": 7.8209701686098e-06,
+      "loss": 0.1341150164604187,
+      "step": 4120
+    },
+    {
+      "epoch": 0.7505458515283843,
+      "grad_norm": 0.16892646253108978,
+      "learning_rate": 7.767568074950751e-06,
+      "loss": 0.1466840147972107,
+      "step": 4125
+    },
+    {
+      "epoch": 0.7514556040756915,
+      "grad_norm": 0.17288286983966827,
+      "learning_rate": 7.714315369790942e-06,
+      "loss": 0.13819680213928223,
+      "step": 4130
+    },
+    {
+      "epoch": 0.7523653566229985,
+      "grad_norm": 0.21893996000289917,
+      "learning_rate": 7.661212514779745e-06,
+      "loss": 0.14369510412216185,
+      "step": 4135
+    },
+    {
+      "epoch": 0.7532751091703057,
+      "grad_norm": 0.1674601435661316,
+      "learning_rate": 7.608259970267509e-06,
+      "loss": 0.14810250997543334,
+      "step": 4140
+    },
+    {
+      "epoch": 0.7541848617176128,
+      "grad_norm": 0.15875539183616638,
+      "learning_rate": 7.555458195301526e-06,
+      "loss": 0.14103198051452637,
+      "step": 4145
+    },
+    {
+      "epoch": 0.75509461426492,
+      "grad_norm": 0.19454079866409302,
+      "learning_rate": 7.502807647622037e-06,
+      "loss": 0.13848764896392823,
+      "step": 4150
+    },
+    {
+      "epoch": 0.756004366812227,
+      "grad_norm": 0.1795455813407898,
+      "learning_rate": 7.450308783658341e-06,
+      "loss": 0.14459335803985596,
+      "step": 4155
+    },
+    {
+      "epoch": 0.7569141193595342,
+      "grad_norm": 0.1643362045288086,
+      "learning_rate": 7.397962058524735e-06,
+      "loss": 0.14335378408432006,
+      "step": 4160
+    },
+    {
+      "epoch": 0.7578238719068413,
+      "grad_norm": 0.16362066566944122,
+      "learning_rate": 7.3457679260166475e-06,
+      "loss": 0.14222005605697632,
+      "step": 4165
+    },
+    {
+      "epoch": 0.7587336244541485,
+      "grad_norm": 0.17313003540039062,
+      "learning_rate": 7.293726838606674e-06,
+      "loss": 0.14272255897521974,
+      "step": 4170
+    },
+    {
+      "epoch": 0.7596433770014556,
+      "grad_norm": 0.1809929460287094,
+      "learning_rate": 7.2418392474406405e-06,
+      "loss": 0.14089123010635377,
+      "step": 4175
+    },
+    {
+      "epoch": 0.7605531295487628,
+      "grad_norm": 0.14306005835533142,
+      "learning_rate": 7.19010560233373e-06,
+      "loss": 0.13531534671783446,
+      "step": 4180
+    },
+    {
+      "epoch": 0.7614628820960698,
+      "grad_norm": 0.15525390207767487,
+      "learning_rate": 7.138526351766559e-06,
+      "loss": 0.14340845346450806,
+      "step": 4185
+    },
+    {
+      "epoch": 0.762372634643377,
+      "grad_norm": 0.24478943645954132,
+      "learning_rate": 7.087101942881263e-06,
+      "loss": 0.14744555950164795,
+      "step": 4190
+    },
+    {
+      "epoch": 0.7632823871906841,
+      "grad_norm": 0.31335577368736267,
+      "learning_rate": 7.035832821477711e-06,
+      "loss": 0.1484094500541687,
+      "step": 4195
+    },
+    {
+      "epoch": 0.7641921397379913,
+      "grad_norm": 0.15140366554260254,
+      "learning_rate": 6.984719432009515e-06,
+      "loss": 0.14991614818572999,
+      "step": 4200
+    },
+    {
+      "epoch": 0.7651018922852983,
+      "grad_norm": 0.16125506162643433,
+      "learning_rate": 6.933762217580289e-06,
+      "loss": 0.1408134937286377,
+      "step": 4205
+    },
+    {
+      "epoch": 0.7660116448326055,
+      "grad_norm": 0.2501450181007385,
+      "learning_rate": 6.882961619939726e-06,
+      "loss": 0.13875640630722047,
+      "step": 4210
+    },
+    {
+      "epoch": 0.7669213973799127,
+      "grad_norm": 0.16227811574935913,
+      "learning_rate": 6.8323180794798245e-06,
+      "loss": 0.14138660430908204,
+      "step": 4215
+    },
+    {
+      "epoch": 0.7678311499272198,
+      "grad_norm": 0.16676810383796692,
+      "learning_rate": 6.781832035231053e-06,
+      "loss": 0.14696706533432008,
+      "step": 4220
+    },
+    {
+      "epoch": 0.768740902474527,
+      "grad_norm": 0.14638574421405792,
+      "learning_rate": 6.731503924858518e-06,
+      "loss": 0.14263020753860473,
+      "step": 4225
+    },
+    {
+      "epoch": 0.769650655021834,
+      "grad_norm": 0.17093190550804138,
+      "learning_rate": 6.681334184658211e-06,
+      "loss": 0.14694111347198485,
+      "step": 4230
+    },
+    {
+      "epoch": 0.7705604075691412,
+      "grad_norm": 0.17174287140369415,
+      "learning_rate": 6.631323249553201e-06,
+      "loss": 0.13854929208755493,
+      "step": 4235
+    },
+    {
+      "epoch": 0.7714701601164483,
+      "grad_norm": 0.14599016308784485,
+      "learning_rate": 6.5814715530898745e-06,
+      "loss": 0.14058833122253417,
+      "step": 4240
+    },
+    {
+      "epoch": 0.7723799126637555,
+      "grad_norm": 0.16222265362739563,
+      "learning_rate": 6.531779527434176e-06,
+      "loss": 0.1428326725959778,
+      "step": 4245
+    },
+    {
+      "epoch": 0.7732896652110626,
+      "grad_norm": 0.1741994023323059,
+      "learning_rate": 6.482247603367839e-06,
+      "loss": 0.13985042572021483,
+      "step": 4250
+    },
+    {
+      "epoch": 0.7741994177583698,
+      "grad_norm": 0.17427101731300354,
+      "learning_rate": 6.432876210284688e-06,
+      "loss": 0.1442667603492737,
+      "step": 4255
+    },
+    {
+      "epoch": 0.7751091703056768,
+      "grad_norm": 0.1665259599685669,
+      "learning_rate": 6.383665776186912e-06,
+      "loss": 0.1421986222267151,
+      "step": 4260
+    },
+    {
+      "epoch": 0.776018922852984,
+      "grad_norm": 0.1728232353925705,
+      "learning_rate": 6.334616727681303e-06,
+      "loss": 0.1367053508758545,
+      "step": 4265
+    },
+    {
+      "epoch": 0.7769286754002911,
+      "grad_norm": 0.15882381796836853,
+      "learning_rate": 6.285729489975639e-06,
+      "loss": 0.14551182985305786,
+      "step": 4270
+    },
+    {
+      "epoch": 0.7778384279475983,
+      "grad_norm": 0.242042675614357,
+      "learning_rate": 6.2370044868749115e-06,
+      "loss": 0.1455132007598877,
+      "step": 4275
+    },
+    {
+      "epoch": 0.7787481804949054,
+      "grad_norm": 0.1599501073360443,
+      "learning_rate": 6.188442140777742e-06,
+      "loss": 0.1424942970275879,
+      "step": 4280
+    },
+    {
+      "epoch": 0.7796579330422125,
+      "grad_norm": 0.15182635188102722,
+      "learning_rate": 6.140042872672647e-06,
+      "loss": 0.14212887287139891,
+      "step": 4285
+    },
+    {
+      "epoch": 0.7805676855895196,
+      "grad_norm": 0.1720375418663025,
+      "learning_rate": 6.091807102134403e-06,
+      "loss": 0.14243412017822266,
+      "step": 4290
+    },
+    {
+      "epoch": 0.7814774381368268,
+      "grad_norm": 0.16436047852039337,
+      "learning_rate": 6.043735247320454e-06,
+      "loss": 0.15035657882690429,
+      "step": 4295
+    },
+    {
+      "epoch": 0.7823871906841339,
+      "grad_norm": 0.1498408019542694,
+      "learning_rate": 5.995827724967218e-06,
+      "loss": 0.14494839906692505,
+      "step": 4300
+    },
+    {
+      "epoch": 0.7832969432314411,
+      "grad_norm": 0.16924560070037842,
+      "learning_rate": 5.948084950386535e-06,
+      "loss": 0.13581212759017944,
+      "step": 4305
+    },
+    {
+      "epoch": 0.7842066957787481,
+      "grad_norm": 0.15889139473438263,
+      "learning_rate": 5.900507337462036e-06,
+      "loss": 0.15071530342102052,
+      "step": 4310
+    },
+    {
+      "epoch": 0.7851164483260553,
+      "grad_norm": 0.17201054096221924,
+      "learning_rate": 5.853095298645542e-06,
+      "loss": 0.1398628830909729,
+      "step": 4315
+    },
+    {
+      "epoch": 0.7860262008733624,
+      "grad_norm": 0.17965619266033173,
+      "learning_rate": 5.805849244953548e-06,
+      "loss": 0.14666696786880493,
+      "step": 4320
+    },
+    {
+      "epoch": 0.7869359534206696,
+      "grad_norm": 0.17514032125473022,
+      "learning_rate": 5.758769585963569e-06,
+      "loss": 0.1383386731147766,
+      "step": 4325
+    },
+    {
+      "epoch": 0.7878457059679768,
+      "grad_norm": 0.17497631907463074,
+      "learning_rate": 5.7118567298106744e-06,
+      "loss": 0.14362354278564454,
+      "step": 4330
+    },
+    {
+      "epoch": 0.7887554585152838,
+      "grad_norm": 0.16770458221435547,
+      "learning_rate": 5.665111083183905e-06,
+      "loss": 0.14136618375778198,
+      "step": 4335
+    },
+    {
+      "epoch": 0.789665211062591,
+      "grad_norm": 0.17134106159210205,
+      "learning_rate": 5.618533051322747e-06,
+      "loss": 0.1401529550552368,
+      "step": 4340
+    },
+    {
+      "epoch": 0.7905749636098981,
+      "grad_norm": 0.19458788633346558,
+      "learning_rate": 5.5721230380136435e-06,
+      "loss": 0.1393273115158081,
+      "step": 4345
+    },
+    {
+      "epoch": 0.7914847161572053,
+      "grad_norm": 0.19483692944049835,
+      "learning_rate": 5.525881445586467e-06,
+      "loss": 0.1369825482368469,
+      "step": 4350
+    },
+    {
+      "epoch": 0.7923944687045124,
+      "grad_norm": 0.3052191734313965,
+      "learning_rate": 5.4798086749110495e-06,
+      "loss": 0.14762181043624878,
+      "step": 4355
+    },
+    {
+      "epoch": 0.7933042212518195,
+      "grad_norm": 0.164458766579628,
+      "learning_rate": 5.4339051253937065e-06,
+      "loss": 0.14501686096191407,
+      "step": 4360
+    },
+    {
+      "epoch": 0.7942139737991266,
+      "grad_norm": 0.1719193458557129,
+      "learning_rate": 5.3881711949737625e-06,
+      "loss": 0.13321092128753662,
+      "step": 4365
+    },
+    {
+      "epoch": 0.7951237263464338,
+      "grad_norm": 0.17219696938991547,
+      "learning_rate": 5.342607280120121e-06,
+      "loss": 0.1413906455039978,
+      "step": 4370
+    },
+    {
+      "epoch": 0.7960334788937409,
+      "grad_norm": 0.15083056688308716,
+      "learning_rate": 5.297213775827789e-06,
+      "loss": 0.14772192239761353,
+      "step": 4375
+    },
+    {
+      "epoch": 0.7969432314410481,
+      "grad_norm": 0.1699071079492569,
+      "learning_rate": 5.251991075614507e-06,
+      "loss": 0.1392375946044922,
+      "step": 4380
+    },
+    {
+      "epoch": 0.7978529839883551,
+      "grad_norm": 0.1680395007133484,
+      "learning_rate": 5.206939571517302e-06,
+      "loss": 0.14185575246810914,
+      "step": 4385
+    },
+    {
+      "epoch": 0.7987627365356623,
+      "grad_norm": 0.16526710987091064,
+      "learning_rate": 5.162059654089083e-06,
+      "loss": 0.15001428127288818,
+      "step": 4390
+    },
+    {
+      "epoch": 0.7996724890829694,
+      "grad_norm": 0.16281752288341522,
+      "learning_rate": 5.1173517123952794e-06,
+      "loss": 0.13747023344039916,
+      "step": 4395
+    },
+    {
+      "epoch": 0.8005822416302766,
+      "grad_norm": 0.1454378366470337,
+      "learning_rate": 5.072816134010458e-06,
+      "loss": 0.14710829257965088,
+      "step": 4400
+    },
+    {
+      "epoch": 0.8014919941775837,
+      "grad_norm": 0.16565890610218048,
+      "learning_rate": 5.028453305014966e-06,
+      "loss": 0.14138611555099487,
+      "step": 4405
+    },
+    {
+      "epoch": 0.8024017467248908,
+      "grad_norm": 0.1962810605764389,
+      "learning_rate": 4.984263609991577e-06,
+      "loss": 0.13836177587509155,
+      "step": 4410
+    },
+    {
+      "epoch": 0.8033114992721979,
+      "grad_norm": 0.16091369092464447,
+      "learning_rate": 4.940247432022149e-06,
+      "loss": 0.14407440423965454,
+      "step": 4415
+    },
+    {
+      "epoch": 0.8042212518195051,
+      "grad_norm": 0.1930241584777832,
+      "learning_rate": 4.89640515268433e-06,
+      "loss": 0.14346336126327514,
+      "step": 4420
+    },
+    {
+      "epoch": 0.8051310043668122,
+      "grad_norm": 0.19301500916481018,
+      "learning_rate": 4.852737152048242e-06,
+      "loss": 0.14174317121505736,
+      "step": 4425
+    },
+    {
+      "epoch": 0.8060407569141194,
+      "grad_norm": 0.1541353315114975,
+      "learning_rate": 4.80924380867315e-06,
+      "loss": 0.14100592136383056,
+      "step": 4430
+    },
+    {
+      "epoch": 0.8069505094614265,
+      "grad_norm": 0.16285750269889832,
+      "learning_rate": 4.765925499604243e-06,
+      "loss": 0.1441288709640503,
+      "step": 4435
+    },
+    {
+      "epoch": 0.8078602620087336,
+      "grad_norm": 0.17382675409317017,
+      "learning_rate": 4.722782600369299e-06,
+      "loss": 0.13763951063156127,
+      "step": 4440
+    },
+    {
+      "epoch": 0.8087700145560408,
+      "grad_norm": 0.1697344034910202,
+      "learning_rate": 4.679815484975505e-06,
+      "loss": 0.1410105347633362,
+      "step": 4445
+    },
+    {
+      "epoch": 0.8096797671033479,
+      "grad_norm": 0.19964542984962463,
+      "learning_rate": 4.637024525906131e-06,
+      "loss": 0.1439276695251465,
+      "step": 4450
+    },
+    {
+      "epoch": 0.8105895196506551,
+      "grad_norm": 0.165307879447937,
+      "learning_rate": 4.59441009411736e-06,
+      "loss": 0.13897504806518554,
+      "step": 4455
+    },
+    {
+      "epoch": 0.8114992721979621,
+      "grad_norm": 0.16687989234924316,
+      "learning_rate": 4.551972559035067e-06,
+      "loss": 0.1422593355178833,
+      "step": 4460
+    },
+    {
+      "epoch": 0.8124090247452693,
+      "grad_norm": 0.15737789869308472,
+      "learning_rate": 4.509712288551571e-06,
+      "loss": 0.1452128052711487,
+      "step": 4465
+    },
+    {
+      "epoch": 0.8133187772925764,
+      "grad_norm": 0.17116659879684448,
+      "learning_rate": 4.467629649022509e-06,
+      "loss": 0.14385371208190917,
+      "step": 4470
+    },
+    {
+      "epoch": 0.8142285298398836,
+      "grad_norm": 0.17457640171051025,
+      "learning_rate": 4.425725005263623e-06,
+      "loss": 0.14808475971221924,
+      "step": 4475
+    },
+    {
+      "epoch": 0.8151382823871907,
+      "grad_norm": 0.1621970385313034,
+      "learning_rate": 4.383998720547583e-06,
+      "loss": 0.13927959203720092,
+      "step": 4480
+    },
+    {
+      "epoch": 0.8160480349344978,
+      "grad_norm": 0.176296666264534,
+      "learning_rate": 4.342451156600896e-06,
+      "loss": 0.15041060447692872,
+      "step": 4485
+    },
+    {
+      "epoch": 0.8169577874818049,
+      "grad_norm": 0.17157645523548126,
+      "learning_rate": 4.301082673600698e-06,
+      "loss": 0.13932652473449708,
+      "step": 4490
+    },
+    {
+      "epoch": 0.8178675400291121,
+      "grad_norm": 0.15378527343273163,
+      "learning_rate": 4.259893630171682e-06,
+      "loss": 0.1406856894493103,
+      "step": 4495
+    },
+    {
+      "epoch": 0.8187772925764192,
+      "grad_norm": 0.1750226765871048,
+      "learning_rate": 4.218884383382987e-06,
+      "loss": 0.1350164532661438,
+      "step": 4500
+    },
+    {
+      "epoch": 0.8196870451237264,
+      "grad_norm": 0.1393742561340332,
+      "learning_rate": 4.178055288745053e-06,
+      "loss": 0.13769235610961914,
+      "step": 4505
+    },
+    {
+      "epoch": 0.8205967976710334,
+      "grad_norm": 0.1668994128704071,
+      "learning_rate": 4.137406700206617e-06,
+      "loss": 0.14029752016067504,
+      "step": 4510
+    },
+    {
+      "epoch": 0.8215065502183406,
+      "grad_norm": 0.1833454668521881,
+      "learning_rate": 4.0969389701515675e-06,
+      "loss": 0.14276301860809326,
+      "step": 4515
+    },
+    {
+      "epoch": 0.8224163027656477,
+      "grad_norm": 0.16187874972820282,
+      "learning_rate": 4.056652449395945e-06,
+      "loss": 0.1444832682609558,
+      "step": 4520
+    },
+    {
+      "epoch": 0.8233260553129549,
+      "grad_norm": 0.1453280746936798,
+      "learning_rate": 4.01654748718488e-06,
+      "loss": 0.14512733221054078,
+      "step": 4525
+    },
+    {
+      "epoch": 0.824235807860262,
+      "grad_norm": 0.1782725751399994,
+      "learning_rate": 3.976624431189563e-06,
+      "loss": 0.14093561172485353,
+      "step": 4530
+    },
+    {
+      "epoch": 0.8251455604075691,
+      "grad_norm": 0.17374491691589355,
+      "learning_rate": 3.936883627504234e-06,
+      "loss": 0.14031401872634888,
+      "step": 4535
+    },
+    {
+      "epoch": 0.8260553129548762,
+      "grad_norm": 0.1609172821044922,
+      "learning_rate": 3.897325420643174e-06,
+      "loss": 0.1428336262702942,
+      "step": 4540
+    },
+    {
+      "epoch": 0.8269650655021834,
+      "grad_norm": 0.1520884931087494,
+      "learning_rate": 3.85795015353774e-06,
+      "loss": 0.1460547924041748,
+      "step": 4545
+    },
+    {
+      "epoch": 0.8278748180494906,
+      "grad_norm": 0.20986326038837433,
+      "learning_rate": 3.818758167533376e-06,
+      "loss": 0.14706350564956666,
+      "step": 4550
+    },
+    {
+      "epoch": 0.8287845705967977,
+      "grad_norm": 0.16825413703918457,
+      "learning_rate": 3.7797498023866396e-06,
+      "loss": 0.14507200717926025,
+      "step": 4555
+    },
+    {
+      "epoch": 0.8296943231441049,
+      "grad_norm": 0.16758380830287933,
+      "learning_rate": 3.740925396262296e-06,
+      "loss": 0.14898381233215333,
+      "step": 4560
+    },
+    {
+      "epoch": 0.8306040756914119,
+      "grad_norm": 0.15207453072071075,
+      "learning_rate": 3.7022852857303503e-06,
+      "loss": 0.14138854742050172,
+      "step": 4565
+    },
+    {
+      "epoch": 0.8315138282387191,
+      "grad_norm": 0.15150749683380127,
+      "learning_rate": 3.66382980576315e-06,
+      "loss": 0.13894975185394287,
+      "step": 4570
+    },
+    {
+      "epoch": 0.8324235807860262,
+      "grad_norm": 0.17071188986301422,
+      "learning_rate": 3.625559289732472e-06,
+      "loss": 0.14072470664978026,
+      "step": 4575
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.154335618019104,
+      "learning_rate": 3.5874740694066294e-06,
+      "loss": 0.13791344165802003,
+      "step": 4580
+    },
+    {
+      "epoch": 0.8342430858806404,
+      "grad_norm": 0.14017128944396973,
+      "learning_rate": 3.5495744749476116e-06,
+      "loss": 0.14427922964096068,
+      "step": 4585
+    },
+    {
+      "epoch": 0.8351528384279476,
+      "grad_norm": 0.17210033535957336,
+      "learning_rate": 3.5118608349081983e-06,
+      "loss": 0.15191166400909423,
+      "step": 4590
+    },
+    {
+      "epoch": 0.8360625909752547,
+      "grad_norm": 0.18715685606002808,
+      "learning_rate": 3.4743334762291358e-06,
+      "loss": 0.14451316595077515,
+      "step": 4595
+    },
+    {
+      "epoch": 0.8369723435225619,
+      "grad_norm": 0.18079884350299835,
+      "learning_rate": 3.436992724236293e-06,
+      "loss": 0.13530746698379517,
+      "step": 4600
+    },
+    {
+      "epoch": 0.837882096069869,
+      "grad_norm": 0.13519920408725739,
+      "learning_rate": 3.399838902637817e-06,
+      "loss": 0.1477964401245117,
+      "step": 4605
+    },
+    {
+      "epoch": 0.8387918486171762,
+      "grad_norm": 0.1778026670217514,
+      "learning_rate": 3.3628723335213885e-06,
+      "loss": 0.14419831037521363,
+      "step": 4610
+    },
+    {
+      "epoch": 0.8397016011644832,
+      "grad_norm": 0.15165366232395172,
+      "learning_rate": 3.326093337351355e-06,
+      "loss": 0.13888469934463502,
+      "step": 4615
+    },
+    {
+      "epoch": 0.8406113537117904,
+      "grad_norm": 0.17049473524093628,
+      "learning_rate": 3.2895022329660018e-06,
+      "loss": 0.14438477754592896,
+      "step": 4620
+    },
+    {
+      "epoch": 0.8415211062590975,
+      "grad_norm": 0.16536414623260498,
+      "learning_rate": 3.2530993375747833e-06,
+      "loss": 0.1444351315498352,
+      "step": 4625
+    },
+    {
+      "epoch": 0.8424308588064047,
+      "grad_norm": 0.17570015788078308,
+      "learning_rate": 3.2168849667555402e-06,
+      "loss": 0.13861945867538453,
+      "step": 4630
+    },
+    {
+      "epoch": 0.8433406113537117,
+      "grad_norm": 0.1699545532464981,
+      "learning_rate": 3.1808594344518132e-06,
+      "loss": 0.13902754783630372,
+      "step": 4635
+    },
+    {
+      "epoch": 0.8442503639010189,
+      "grad_norm": 0.12331254780292511,
+      "learning_rate": 3.1450230529700837e-06,
+      "loss": 0.14104254245758058,
+      "step": 4640
+    },
+    {
+      "epoch": 0.845160116448326,
+      "grad_norm": 0.1508190929889679,
+      "learning_rate": 3.1093761329770708e-06,
+      "loss": 0.13288766145706177,
+      "step": 4645
+    },
+    {
+      "epoch": 0.8460698689956332,
+      "grad_norm": 0.19049489498138428,
+      "learning_rate": 3.0739189834970735e-06,
+      "loss": 0.14914840459823608,
+      "step": 4650
+    },
+    {
+      "epoch": 0.8469796215429404,
+      "grad_norm": 0.1662369966506958,
+      "learning_rate": 3.0386519119092293e-06,
+      "loss": 0.14222898483276367,
+      "step": 4655
+    },
+    {
+      "epoch": 0.8478893740902474,
+      "grad_norm": 0.18985967338085175,
+      "learning_rate": 3.0035752239449126e-06,
+      "loss": 0.14431113004684448,
+      "step": 4660
+    },
+    {
+      "epoch": 0.8487991266375546,
+      "grad_norm": 0.17005261778831482,
+      "learning_rate": 2.9686892236850337e-06,
+      "loss": 0.14140807390213012,
+      "step": 4665
+    },
+    {
+      "epoch": 0.8497088791848617,
+      "grad_norm": 0.16786684095859528,
+      "learning_rate": 2.9339942135574394e-06,
+      "loss": 0.14161460399627684,
+      "step": 4670
+    },
+    {
+      "epoch": 0.8506186317321689,
+      "grad_norm": 0.16358181834220886,
+      "learning_rate": 2.899490494334281e-06,
+      "loss": 0.14674670696258546,
+      "step": 4675
+    },
+    {
+      "epoch": 0.851528384279476,
+      "grad_norm": 0.1651349812746048,
+      "learning_rate": 2.8651783651293867e-06,
+      "loss": 0.13794611692428588,
+      "step": 4680
+    },
+    {
+      "epoch": 0.8524381368267832,
+      "grad_norm": 0.16934923827648163,
+      "learning_rate": 2.831058123395694e-06,
+      "loss": 0.13199397325515747,
+      "step": 4685
+    },
+    {
+      "epoch": 0.8533478893740902,
+      "grad_norm": 0.1704150140285492,
+      "learning_rate": 2.797130064922665e-06,
+      "loss": 0.14044904708862305,
+      "step": 4690
+    },
+    {
+      "epoch": 0.8542576419213974,
+      "grad_norm": 0.1814192682504654,
+      "learning_rate": 2.7633944838337143e-06,
+      "loss": 0.1465100646018982,
+      "step": 4695
+    },
+    {
+      "epoch": 0.8551673944687045,
+      "grad_norm": 0.18942610919475555,
+      "learning_rate": 2.729851672583669e-06,
+      "loss": 0.14685982465744019,
+      "step": 4700
+    },
+    {
+      "epoch": 0.8560771470160117,
+      "grad_norm": 0.17895208299160004,
+      "learning_rate": 2.6965019219562155e-06,
+      "loss": 0.13971571922302245,
+      "step": 4705
+    },
+    {
+      "epoch": 0.8569868995633187,
+      "grad_norm": 0.22735828161239624,
+      "learning_rate": 2.6633455210614055e-06,
+      "loss": 0.13776102066040039,
+      "step": 4710
+    },
+    {
+      "epoch": 0.8578966521106259,
+      "grad_norm": 0.16779793798923492,
+      "learning_rate": 2.630382757333133e-06,
+      "loss": 0.14134042263031005,
+      "step": 4715
+    },
+    {
+      "epoch": 0.858806404657933,
+      "grad_norm": 0.2148888260126114,
+      "learning_rate": 2.597613916526637e-06,
+      "loss": 0.14680721759796142,
+      "step": 4720
+    },
+    {
+      "epoch": 0.8597161572052402,
+      "grad_norm": 0.16560257971286774,
+      "learning_rate": 2.565039282716045e-06,
+      "loss": 0.14137234687805175,
+      "step": 4725
+    },
+    {
+      "epoch": 0.8606259097525473,
+      "grad_norm": 0.16197068989276886,
+      "learning_rate": 2.532659138291879e-06,
+      "loss": 0.14969314336776735,
+      "step": 4730
+    },
+    {
+      "epoch": 0.8615356622998545,
+      "grad_norm": 0.14650246500968933,
+      "learning_rate": 2.5004737639586497e-06,
+      "loss": 0.13532910346984864,
+      "step": 4735
+    },
+    {
+      "epoch": 0.8624454148471615,
+      "grad_norm": 0.1565634310245514,
+      "learning_rate": 2.4684834387323943e-06,
+      "loss": 0.14146244525909424,
+      "step": 4740
+    },
+    {
+      "epoch": 0.8633551673944687,
+      "grad_norm": 0.18060864508152008,
+      "learning_rate": 2.4366884399382393e-06,
+      "loss": 0.14218534231185914,
+      "step": 4745
+    },
+    {
+      "epoch": 0.8642649199417758,
+      "grad_norm": 0.24613255262374878,
+      "learning_rate": 2.4050890432080557e-06,
+      "loss": 0.13907679319381713,
+      "step": 4750
+    },
+    {
+      "epoch": 0.865174672489083,
+      "grad_norm": 0.16036023199558258,
+      "learning_rate": 2.3736855224780057e-06,
+      "loss": 0.13718113899230958,
+      "step": 4755
+    },
+    {
+      "epoch": 0.86608442503639,
+      "grad_norm": 0.16678516566753387,
+      "learning_rate": 2.3424781499862075e-06,
+      "loss": 0.1327962040901184,
+      "step": 4760
+    },
+    {
+      "epoch": 0.8669941775836972,
+      "grad_norm": 0.1763770878314972,
+      "learning_rate": 2.3114671962703727e-06,
+      "loss": 0.14390318393707274,
+      "step": 4765
+    },
+    {
+      "epoch": 0.8679039301310044,
+      "grad_norm": 0.17735697329044342,
+      "learning_rate": 2.280652930165428e-06,
+      "loss": 0.15223288536071777,
+      "step": 4770
+    },
+    {
+      "epoch": 0.8688136826783115,
+      "grad_norm": 0.15827041864395142,
+      "learning_rate": 2.250035618801241e-06,
+      "loss": 0.14296332597732545,
+      "step": 4775
+    },
+    {
+      "epoch": 0.8697234352256187,
+      "grad_norm": 0.16876135766506195,
+      "learning_rate": 2.219615527600244e-06,
+      "loss": 0.1359076738357544,
+      "step": 4780
+    },
+    {
+      "epoch": 0.8706331877729258,
+      "grad_norm": 0.1800110638141632,
+      "learning_rate": 2.189392920275174e-06,
+      "loss": 0.1424281358718872,
+      "step": 4785
+    },
+    {
+      "epoch": 0.8715429403202329,
+      "grad_norm": 0.1409560889005661,
+      "learning_rate": 2.159368058826783e-06,
+      "loss": 0.14480490684509278,
+      "step": 4790
+    },
+    {
+      "epoch": 0.87245269286754,
+      "grad_norm": 0.1634288728237152,
+      "learning_rate": 2.129541203541535e-06,
+      "loss": 0.14513269662857056,
+      "step": 4795
+    },
+    {
+      "epoch": 0.8733624454148472,
+      "grad_norm": 0.17126062512397766,
+      "learning_rate": 2.099912612989391e-06,
+      "loss": 0.13546934127807617,
+      "step": 4800
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.6398928370952863e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-4800/training_args.bin b/checkpoint-4800/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-4800/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-4900/README.md b/checkpoint-4900/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-4900/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-4900/adapter_config.json b/checkpoint-4900/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-4900/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-4900/adapter_model.safetensors b/checkpoint-4900/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7d51edf4626c3b55837775311a22fd372effdbae
--- /dev/null
+++ b/checkpoint-4900/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16f0911a72c2009ffcdaa13711a34bf91ac4328965a15f81fbe2cd8119715038
+size 169741912
diff --git a/checkpoint-4900/chat_template.jinja b/checkpoint-4900/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-4900/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders every entry of properties, sorted by key, as key:{...}. When filter_keys is true, keys listed in standard_keys are skipped. The required argument is accepted but is not read inside this macro. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- becomes true once a field has been written for this key, so each later field is preceded by a comma -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- no properties mapping: render the non-standard keys of value itself as the properties (filter_keys=true) -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}{#- the type field is always written last and carries the brace that closes this key -#}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool as declaration:NAME{description:...,parameters:{...},response:{...}}. The parameters and response braces are closed unconditionally, so a schema without a type (or a non-OBJECT response) still yields balanced output; output is unchanged when a type is present. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>
+        {%- endif -%}{{- '}' -}}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>
+        {%- endif -%}{{- '}' -}}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serializes one value. Strings are wrapped in the <|"|> delimiter, booleans become true/false, mappings become {key:value,...} sorted by key (keys are delimiter-wrapped only when escape_keys is true), other sequences become [a,b,...], and anything else is printed as-is. Nested values recurse with the same escape_keys. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Drops thinking-channel spans: text is split on '<channel|>' and, for every piece containing '<|channel>', only the part before that marker is kept. The concatenated result is trimmed. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emits <|tool_response>response:NAME{...}<tool_response|>. A mapping response is written as key:value pairs sorted by key; any other response is wrapped as value:... . Values go through format_argument with escape_keys=False. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- ns.prev_message_type records the kind of the most recently rendered element; it is read again when closing a message and when adding the generation prompt -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Emit a system turn when thinking is enabled, tools are supplied, or the first message has role system/developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- The thinking marker is written first, ahead of any system text and tool declarations -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the leading system/developer message is consumed here, so the message loop does not render it again -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: store the index within loop_messages of the last message whose role is user (stays -1 when there is none); the message loop compares against it before rendering reasoning -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Render every non-tool message; role:tool messages are not rendered on their own but are pulled in by the forward scan of the assistant message that issued the tool calls -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning / reasoning_content as a thought channel, only for messages after the last user message that also carry tool_calls -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name. The fallback uses default(..., true) because .get() yields None rather than undefined, and a None name would break the string concatenation in format_tool_response_block -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+{#- Close the message: after tool calls with no rendered responses, emit a <|tool_response> opener; otherwise end the turn, except when responses were rendered and the message has no content -#}
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- open a new model turn unless ns.prev_message_type is 'tool_response' or 'tool_call' -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-4900/optimizer.pt b/checkpoint-4900/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6acbd0ce8f094f3a6f20f6507bea363e430f6cd4
--- /dev/null
+++ b/checkpoint-4900/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7b71bcb4660a49d3ac1b098d9d6628096a0792a596b5545fa1d36cf13c890e6
+size 72807355
diff --git a/checkpoint-4900/processor_config.json b/checkpoint-4900/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-4900/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-4900/rng_state.pth b/checkpoint-4900/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-4900/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-4900/scheduler.pt b/checkpoint-4900/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b449a85b5e3a28974a9a9d73bbca9d0b916c4c1a
--- /dev/null
+++ b/checkpoint-4900/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94fba794993de2c072fab349e1124df8846a79fce1e519060425ea6e08ae19a7
+size 1465
diff --git a/checkpoint-4900/tokenizer.json b/checkpoint-4900/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-4900/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-4900/tokenizer_config.json b/checkpoint-4900/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-4900/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-4900/trainer_state.json b/checkpoint-4900/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6bce90e59bb5de169f031c8e1893176287a19433
--- /dev/null
+++ b/checkpoint-4900/trainer_state.json
@@ -0,0 +1,6902 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.8915574963609898,
+  "eval_steps": 100,
+  "global_step": 4900,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6377365356622998,
+      "grad_norm": 0.17014239728450775,
+      "learning_rate": 1.5370045130657366e-05,
+      "loss": 0.14694437980651856,
+      "step": 3505
+    },
+    {
+      "epoch": 0.638646288209607,
+      "grad_norm": 0.14744038879871368,
+      "learning_rate": 1.5302158945041838e-05,
+      "loss": 0.14434736967086792,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6395560407569141,
+      "grad_norm": 0.2069770246744156,
+      "learning_rate": 1.523435683031818e-05,
+      "loss": 0.13982917070388795,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6404657933042213,
+      "grad_norm": 0.17811502516269684,
+      "learning_rate": 1.5166639374265063e-05,
+      "loss": 0.1408839702606201,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6413755458515283,
+      "grad_norm": 0.165786474943161,
+      "learning_rate": 1.509900716392728e-05,
+      "loss": 0.15312877893447877,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6422852983988355,
+      "grad_norm": 0.1633884161710739,
+      "learning_rate": 1.5031460785610596e-05,
+      "loss": 0.1488795518875122,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6431950509461426,
+      "grad_norm": 0.16498984396457672,
+      "learning_rate": 1.4964000824876723e-05,
+      "loss": 0.15031465291976928,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6441048034934498,
+      "grad_norm": 0.18043678998947144,
+      "learning_rate": 1.4896627866538191e-05,
+      "loss": 0.147829806804657,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6450145560407569,
+      "grad_norm": 0.16813597083091736,
+      "learning_rate": 1.4829342494653315e-05,
+      "loss": 0.1418998956680298,
+      "step": 3545
+    },
+    {
+      "epoch": 0.645924308588064,
+      "grad_norm": 0.1817242056131363,
+      "learning_rate": 1.4762145292521118e-05,
+      "loss": 0.14508869647979736,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6468340611353712,
+      "grad_norm": 0.14666494727134705,
+      "learning_rate": 1.469503684267628e-05,
+      "loss": 0.14159854650497436,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6477438136826783,
+      "grad_norm": 0.16485381126403809,
+      "learning_rate": 1.4628017726884086e-05,
+      "loss": 0.14419105052947997,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6486535662299855,
+      "grad_norm": 0.16100342571735382,
+      "learning_rate": 1.4561088526135375e-05,
+      "loss": 0.14501721858978273,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6495633187772926,
+      "grad_norm": 0.16996590793132782,
+      "learning_rate": 1.4494249820641493e-05,
+      "loss": 0.1377166509628296,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6504730713245997,
+      "grad_norm": 0.16168837249279022,
+      "learning_rate": 1.4427502189829339e-05,
+      "loss": 0.1414325475692749,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6513828238719068,
+      "grad_norm": 0.16318906843662262,
+      "learning_rate": 1.436084621233621e-05,
+      "loss": 0.14685193300247193,
+      "step": 3580
+    },
+    {
+      "epoch": 0.652292576419214,
+      "grad_norm": 0.1636219322681427,
+      "learning_rate": 1.4294282466004899e-05,
+      "loss": 0.1405899167060852,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6532023289665211,
+      "grad_norm": 0.1838461309671402,
+      "learning_rate": 1.422781152787865e-05,
+      "loss": 0.14386332035064697,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6541120815138283,
+      "grad_norm": 0.1796344667673111,
+      "learning_rate": 1.4161433974196115e-05,
+      "loss": 0.1513024687767029,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6550218340611353,
+      "grad_norm": 0.16424529254436493,
+      "learning_rate": 1.4095150380386427e-05,
+      "loss": 0.14238927364349366,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6559315866084425,
+      "grad_norm": 0.19264160096645355,
+      "learning_rate": 1.402896132106415e-05,
+      "loss": 0.14297477006912232,
+      "step": 3605
+    },
+    {
+      "epoch": 0.6568413391557496,
+      "grad_norm": 0.18319948017597198,
+      "learning_rate": 1.3962867370024347e-05,
+      "loss": 0.1448880434036255,
+      "step": 3610
+    },
+    {
+      "epoch": 0.6577510917030568,
+      "grad_norm": 0.16507290303707123,
+      "learning_rate": 1.389686910023758e-05,
+      "loss": 0.14724698066711425,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6586608442503639,
+      "grad_norm": 0.17871244251728058,
+      "learning_rate": 1.3830967083844942e-05,
+      "loss": 0.14479386806488037,
+      "step": 3620
+    },
+    {
+      "epoch": 0.659570596797671,
+      "grad_norm": 0.1846228390932083,
+      "learning_rate": 1.3765161892153112e-05,
+      "loss": 0.1453616738319397,
+      "step": 3625
+    },
+    {
+      "epoch": 0.6604803493449781,
+      "grad_norm": 0.17185978591442108,
+      "learning_rate": 1.3699454095629372e-05,
+      "loss": 0.14906206130981445,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6613901018922853,
+      "grad_norm": 0.14751191437244415,
+      "learning_rate": 1.3633844263896698e-05,
+      "loss": 0.13991892337799072,
+      "step": 3635
+    },
+    {
+      "epoch": 0.6622998544395924,
+      "grad_norm": 0.22059763967990875,
+      "learning_rate": 1.3568332965728817e-05,
+      "loss": 0.14680869579315187,
+      "step": 3640
+    },
+    {
+      "epoch": 0.6632096069868996,
+      "grad_norm": 0.15295909345149994,
+      "learning_rate": 1.3502920769045232e-05,
+      "loss": 0.1404443383216858,
+      "step": 3645
+    },
+    {
+      "epoch": 0.6641193595342066,
+      "grad_norm": 0.14600558578968048,
+      "learning_rate": 1.3437608240906364e-05,
+      "loss": 0.14663270711898804,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6650291120815138,
+      "grad_norm": 0.15548352897167206,
+      "learning_rate": 1.3372395947508587e-05,
+      "loss": 0.1431443452835083,
+      "step": 3655
+    },
+    {
+      "epoch": 0.665938864628821,
+      "grad_norm": 0.1813388466835022,
+      "learning_rate": 1.3307284454179342e-05,
+      "loss": 0.1458706736564636,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6668486171761281,
+      "grad_norm": 0.16326870024204254,
+      "learning_rate": 1.3242274325372247e-05,
+      "loss": 0.14700595140457154,
+      "step": 3665
+    },
+    {
+      "epoch": 0.6677583697234353,
+      "grad_norm": 0.18779197335243225,
+      "learning_rate": 1.3177366124662149e-05,
+      "loss": 0.1497237801551819,
+      "step": 3670
+    },
+    {
+      "epoch": 0.6686681222707423,
+      "grad_norm": 0.16291002929210663,
+      "learning_rate": 1.3112560414740315e-05,
+      "loss": 0.1387086868286133,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6695778748180495,
+      "grad_norm": 0.1532297134399414,
+      "learning_rate": 1.3047857757409487e-05,
+      "loss": 0.14497545957565308,
+      "step": 3680
+    },
+    {
+      "epoch": 0.6704876273653566,
+      "grad_norm": 0.14697515964508057,
+      "learning_rate": 1.2983258713579066e-05,
+      "loss": 0.1494283437728882,
+      "step": 3685
+    },
+    {
+      "epoch": 0.6713973799126638,
+      "grad_norm": 0.15213452279567719,
+      "learning_rate": 1.2918763843260218e-05,
+      "loss": 0.1468907594680786,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6723071324599709,
+      "grad_norm": 0.1745215803384781,
+      "learning_rate": 1.285437370556099e-05,
+      "loss": 0.14997754096984864,
+      "step": 3695
+    },
+    {
+      "epoch": 0.673216885007278,
+      "grad_norm": 0.19207637012004852,
+      "learning_rate": 1.2790088858681577e-05,
+      "loss": 0.14202862977981567,
+      "step": 3700
+    },
+    {
+      "epoch": 0.6741266375545851,
+      "grad_norm": 0.1521359086036682,
+      "learning_rate": 1.2725909859909313e-05,
+      "loss": 0.14547673463821412,
+      "step": 3705
+    },
+    {
+      "epoch": 0.6750363901018923,
+      "grad_norm": 0.16975535452365875,
+      "learning_rate": 1.2661837265613999e-05,
+      "loss": 0.14006874561309815,
+      "step": 3710
+    },
+    {
+      "epoch": 0.6759461426491994,
+      "grad_norm": 0.22234582901000977,
+      "learning_rate": 1.2597871631242992e-05,
+      "loss": 0.13691173791885375,
+      "step": 3715
+    },
+    {
+      "epoch": 0.6768558951965066,
+      "grad_norm": 0.16082969307899475,
+      "learning_rate": 1.2534013511316383e-05,
+      "loss": 0.14932308197021485,
+      "step": 3720
+    },
+    {
+      "epoch": 0.6777656477438136,
+      "grad_norm": 0.1751091182231903,
+      "learning_rate": 1.247026345942226e-05,
+      "loss": 0.14531974792480468,
+      "step": 3725
+    },
+    {
+      "epoch": 0.6786754002911208,
+      "grad_norm": 0.15838147699832916,
+      "learning_rate": 1.2406622028211844e-05,
+      "loss": 0.14759832620620728,
+      "step": 3730
+    },
+    {
+      "epoch": 0.6795851528384279,
+      "grad_norm": 0.1771744042634964,
+      "learning_rate": 1.2343089769394714e-05,
+      "loss": 0.1382831573486328,
+      "step": 3735
+    },
+    {
+      "epoch": 0.6804949053857351,
+      "grad_norm": 0.16301538050174713,
+      "learning_rate": 1.2279667233734037e-05,
+      "loss": 0.14444775581359864,
+      "step": 3740
+    },
+    {
+      "epoch": 0.6814046579330422,
+      "grad_norm": 0.1584121286869049,
+      "learning_rate": 1.2216354971041796e-05,
+      "loss": 0.14200170040130616,
+      "step": 3745
+    },
+    {
+      "epoch": 0.6823144104803494,
+      "grad_norm": 0.139187291264534,
+      "learning_rate": 1.2153153530174007e-05,
+      "loss": 0.14318310022354125,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6832241630276564,
+      "grad_norm": 0.13665248453617096,
+      "learning_rate": 1.2090063459025955e-05,
+      "loss": 0.1411946654319763,
+      "step": 3755
+    },
+    {
+      "epoch": 0.6841339155749636,
+      "grad_norm": 0.16273781657218933,
+      "learning_rate": 1.2027085304527475e-05,
+      "loss": 0.14873508214950562,
+      "step": 3760
+    },
+    {
+      "epoch": 0.6850436681222707,
+      "grad_norm": 0.16317526996135712,
+      "learning_rate": 1.1964219612638194e-05,
+      "loss": 0.14644203186035157,
+      "step": 3765
+    },
+    {
+      "epoch": 0.6859534206695779,
+      "grad_norm": 0.17253617942333221,
+      "learning_rate": 1.1901466928342777e-05,
+      "loss": 0.14027841091156007,
+      "step": 3770
+    },
+    {
+      "epoch": 0.6868631732168851,
+      "grad_norm": 0.19692830741405487,
+      "learning_rate": 1.183882779564624e-05,
+      "loss": 0.14411110877990724,
+      "step": 3775
+    },
+    {
+      "epoch": 0.6877729257641921,
+      "grad_norm": 0.15444578230381012,
+      "learning_rate": 1.1776302757569214e-05,
+      "loss": 0.14355008602142333,
+      "step": 3780
+    },
+    {
+      "epoch": 0.6886826783114993,
+      "grad_norm": 0.1622200757265091,
+      "learning_rate": 1.1713892356143239e-05,
+      "loss": 0.14794334173202514,
+      "step": 3785
+    },
+    {
+      "epoch": 0.6895924308588064,
+      "grad_norm": 0.1898501068353653,
+      "learning_rate": 1.1651597132406073e-05,
+      "loss": 0.1418622612953186,
+      "step": 3790
+    },
+    {
+      "epoch": 0.6905021834061136,
+      "grad_norm": 0.17803208529949188,
+      "learning_rate": 1.1589417626396973e-05,
+      "loss": 0.14576040506362914,
+      "step": 3795
+    },
+    {
+      "epoch": 0.6914119359534207,
+      "grad_norm": 0.17138013243675232,
+      "learning_rate": 1.1527354377152053e-05,
+      "loss": 0.14494270086288452,
+      "step": 3800
+    },
+    {
+      "epoch": 0.6923216885007278,
+      "grad_norm": 0.15170913934707642,
+      "learning_rate": 1.1465407922699603e-05,
+      "loss": 0.144084370136261,
+      "step": 3805
+    },
+    {
+      "epoch": 0.6932314410480349,
+      "grad_norm": 0.158562570810318,
+      "learning_rate": 1.1403578800055387e-05,
+      "loss": 0.13636608123779298,
+      "step": 3810
+    },
+    {
+      "epoch": 0.6941411935953421,
+      "grad_norm": 0.17687302827835083,
+      "learning_rate": 1.1341867545218044e-05,
+      "loss": 0.14214688539505005,
+      "step": 3815
+    },
+    {
+      "epoch": 0.6950509461426492,
+      "grad_norm": 0.15394899249076843,
+      "learning_rate": 1.1280274693164378e-05,
+      "loss": 0.14914129972457885,
+      "step": 3820
+    },
+    {
+      "epoch": 0.6959606986899564,
+      "grad_norm": 0.15709355473518372,
+      "learning_rate": 1.12188007778448e-05,
+      "loss": 0.14798580408096312,
+      "step": 3825
+    },
+    {
+      "epoch": 0.6968704512372634,
+      "grad_norm": 0.16631539165973663,
+      "learning_rate": 1.115744633217864e-05,
+      "loss": 0.14756966829299928,
+      "step": 3830
+    },
+    {
+      "epoch": 0.6977802037845706,
+      "grad_norm": 0.15893076360225677,
+      "learning_rate": 1.109621188804951e-05,
+      "loss": 0.14061959981918334,
+      "step": 3835
+    },
+    {
+      "epoch": 0.6986899563318777,
+      "grad_norm": 0.183414489030838,
+      "learning_rate": 1.103509797630077e-05,
+      "loss": 0.1448473334312439,
+      "step": 3840
+    },
+    {
+      "epoch": 0.6995997088791849,
+      "grad_norm": 0.14087305963039398,
+      "learning_rate": 1.0974105126730841e-05,
+      "loss": 0.14369285106658936,
+      "step": 3845
+    },
+    {
+      "epoch": 0.700509461426492,
+      "grad_norm": 0.16919967532157898,
+      "learning_rate": 1.0913233868088685e-05,
+      "loss": 0.1478085398674011,
+      "step": 3850
+    },
+    {
+      "epoch": 0.7014192139737991,
+      "grad_norm": 0.1439533829689026,
+      "learning_rate": 1.0852484728069178e-05,
+      "loss": 0.14376721382141114,
+      "step": 3855
+    },
+    {
+      "epoch": 0.7023289665211062,
+      "grad_norm": 0.17719274759292603,
+      "learning_rate": 1.0791858233308521e-05,
+      "loss": 0.14089040756225585,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7032387190684134,
+      "grad_norm": 0.19753769040107727,
+      "learning_rate": 1.0731354909379754e-05,
+      "loss": 0.15021742582321168,
+      "step": 3865
+    },
+    {
+      "epoch": 0.7041484716157205,
+      "grad_norm": 0.19186992943286896,
+      "learning_rate": 1.0670975280788086e-05,
+      "loss": 0.14113202095031738,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7050582241630277,
+      "grad_norm": 0.1709229201078415,
+      "learning_rate": 1.0610719870966443e-05,
+      "loss": 0.1500566840171814,
+      "step": 3875
+    },
+    {
+      "epoch": 0.7059679767103348,
+      "grad_norm": 0.17846204340457916,
+      "learning_rate": 1.0550589202270892e-05,
+      "loss": 0.15014195442199707,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7068777292576419,
+      "grad_norm": 0.1827082335948944,
+      "learning_rate": 1.0490583795976091e-05,
+      "loss": 0.1423472762107849,
+      "step": 3885
+    },
+    {
+      "epoch": 0.7077874818049491,
+      "grad_norm": 0.17418377101421356,
+      "learning_rate": 1.043070417227083e-05,
+      "loss": 0.14668900966644288,
+      "step": 3890
+    },
+    {
+      "epoch": 0.7086972343522562,
+      "grad_norm": 0.17385616898536682,
+      "learning_rate": 1.0370950850253449e-05,
+      "loss": 0.14627279043197633,
+      "step": 3895
+    },
+    {
+      "epoch": 0.7096069868995634,
+      "grad_norm": 0.16486723721027374,
+      "learning_rate": 1.0311324347927404e-05,
+      "loss": 0.14603652954101562,
+      "step": 3900
+    },
+    {
+      "epoch": 0.7105167394468704,
+      "grad_norm": 0.21806862950325012,
+      "learning_rate": 1.0251825182196732e-05,
+      "loss": 0.1488169550895691,
+      "step": 3905
+    },
+    {
+      "epoch": 0.7114264919941776,
+      "grad_norm": 0.19884569942951202,
+      "learning_rate": 1.019245386886159e-05,
+      "loss": 0.14387656450271608,
+      "step": 3910
+    },
+    {
+      "epoch": 0.7123362445414847,
+      "grad_norm": 0.16139011085033417,
+      "learning_rate": 1.0133210922613789e-05,
+      "loss": 0.1483074426651001,
+      "step": 3915
+    },
+    {
+      "epoch": 0.7132459970887919,
+      "grad_norm": 0.17000740766525269,
+      "learning_rate": 1.007409685703229e-05,
+      "loss": 0.14050065279006957,
+      "step": 3920
+    },
+    {
+      "epoch": 0.714155749636099,
+      "grad_norm": 0.17235304415225983,
+      "learning_rate": 1.0015112184578813e-05,
+      "loss": 0.1440442681312561,
+      "step": 3925
+    },
+    {
+      "epoch": 0.7150655021834061,
+      "grad_norm": 0.15737567842006683,
+      "learning_rate": 9.956257416593362e-06,
+      "loss": 0.14960765838623047,
+      "step": 3930
+    },
+    {
+      "epoch": 0.7159752547307132,
+      "grad_norm": 0.15499180555343628,
+      "learning_rate": 9.897533063289773e-06,
+      "loss": 0.14488829374313356,
+      "step": 3935
+    },
+    {
+      "epoch": 0.7168850072780204,
+      "grad_norm": 0.17744216322898865,
+      "learning_rate": 9.838939633751337e-06,
+      "loss": 0.1416949987411499,
+      "step": 3940
+    },
+    {
+      "epoch": 0.7177947598253275,
+      "grad_norm": 0.1597192883491516,
+      "learning_rate": 9.780477635926358e-06,
+      "loss": 0.14275280237197877,
+      "step": 3945
+    },
+    {
+      "epoch": 0.7187045123726347,
+      "grad_norm": 0.17800374329090118,
+      "learning_rate": 9.722147576623743e-06,
+      "loss": 0.14532098770141602,
+      "step": 3950
+    },
+    {
+      "epoch": 0.7196142649199417,
+      "grad_norm": 0.1828162521123886,
+      "learning_rate": 9.66394996150864e-06,
+      "loss": 0.14525585174560546,
+      "step": 3955
+    },
+    {
+      "epoch": 0.7205240174672489,
+      "grad_norm": 0.1800539344549179,
+      "learning_rate": 9.605885295098005e-06,
+      "loss": 0.14235819578170777,
+      "step": 3960
+    },
+    {
+      "epoch": 0.721433770014556,
+      "grad_norm": 0.16556483507156372,
+      "learning_rate": 9.54795408075628e-06,
+      "loss": 0.13965482711791993,
+      "step": 3965
+    },
+    {
+      "epoch": 0.7223435225618632,
+      "grad_norm": 0.1592024862766266,
+      "learning_rate": 9.49015682069101e-06,
+      "loss": 0.14051042795181273,
+      "step": 3970
+    },
+    {
+      "epoch": 0.7232532751091703,
+      "grad_norm": 0.18988847732543945,
+      "learning_rate": 9.43249401594846e-06,
+      "loss": 0.1436900496482849,
+      "step": 3975
+    },
+    {
+      "epoch": 0.7241630276564774,
+      "grad_norm": 0.24433808028697968,
+      "learning_rate": 9.374966166409329e-06,
+      "loss": 0.14883997440338134,
+      "step": 3980
+    },
+    {
+      "epoch": 0.7250727802037845,
+      "grad_norm": 0.15091639757156372,
+      "learning_rate": 9.317573770784352e-06,
+      "loss": 0.14726560115814208,
+      "step": 3985
+    },
+    {
+      "epoch": 0.7259825327510917,
+      "grad_norm": 0.17045573890209198,
+      "learning_rate": 9.260317326610051e-06,
+      "loss": 0.14120506048202514,
+      "step": 3990
+    },
+    {
+      "epoch": 0.7268922852983989,
+      "grad_norm": 0.18847957253456116,
+      "learning_rate": 9.203197330244343e-06,
+      "loss": 0.1377041220664978,
+      "step": 3995
+    },
+    {
+      "epoch": 0.727802037845706,
+      "grad_norm": 0.1516445279121399,
+      "learning_rate": 9.14621427686229e-06,
+      "loss": 0.14043946266174318,
+      "step": 4000
+    },
+    {
+      "epoch": 0.7287117903930131,
+      "grad_norm": 0.18264050781726837,
+      "learning_rate": 9.0893686604518e-06,
+      "loss": 0.14080368280410765,
+      "step": 4005
+    },
+    {
+      "epoch": 0.7296215429403202,
+      "grad_norm": 0.19129371643066406,
+      "learning_rate": 9.032660973809312e-06,
+      "loss": 0.1402561902999878,
+      "step": 4010
+    },
+    {
+      "epoch": 0.7305312954876274,
+      "grad_norm": 0.15762710571289062,
+      "learning_rate": 8.976091708535567e-06,
+      "loss": 0.14421157836914061,
+      "step": 4015
+    },
+    {
+      "epoch": 0.7314410480349345,
+      "grad_norm": 0.17785198986530304,
+      "learning_rate": 8.919661355031331e-06,
+      "loss": 0.14999009370803834,
+      "step": 4020
+    },
+    {
+      "epoch": 0.7323508005822417,
+      "grad_norm": 0.15306031703948975,
+      "learning_rate": 8.8633704024931e-06,
+      "loss": 0.14101698398590087,
+      "step": 4025
+    },
+    {
+      "epoch": 0.7332605531295487,
+      "grad_norm": 0.16481758654117584,
+      "learning_rate": 8.807219338908968e-06,
+      "loss": 0.14170764684677123,
+      "step": 4030
+    },
+    {
+      "epoch": 0.7341703056768559,
+      "grad_norm": 0.14892235398292542,
+      "learning_rate": 8.751208651054257e-06,
+      "loss": 0.15317896604537964,
+      "step": 4035
+    },
+    {
+      "epoch": 0.735080058224163,
+      "grad_norm": 0.1775592565536499,
+      "learning_rate": 8.695338824487409e-06,
+      "loss": 0.1520617723464966,
+      "step": 4040
+    },
+    {
+      "epoch": 0.7359898107714702,
+      "grad_norm": 0.1614258885383606,
+      "learning_rate": 8.639610343545728e-06,
+      "loss": 0.13747400045394897,
+      "step": 4045
+    },
+    {
+      "epoch": 0.7368995633187773,
+      "grad_norm": 0.21415506303310394,
+      "learning_rate": 8.58402369134117e-06,
+      "loss": 0.1432439088821411,
+      "step": 4050
+    },
+    {
+      "epoch": 0.7378093158660844,
+      "grad_norm": 0.1759418249130249,
+      "learning_rate": 8.528579349756205e-06,
+      "loss": 0.141641104221344,
+      "step": 4055
+    },
+    {
+      "epoch": 0.7387190684133915,
+      "grad_norm": 0.16738329827785492,
+      "learning_rate": 8.47327779943957e-06,
+      "loss": 0.14294810295104982,
+      "step": 4060
+    },
+    {
+      "epoch": 0.7396288209606987,
+      "grad_norm": 0.13916844129562378,
+      "learning_rate": 8.41811951980217e-06,
+      "loss": 0.13876968622207642,
+      "step": 4065
+    },
+    {
+      "epoch": 0.7405385735080058,
+      "grad_norm": 0.1828441321849823,
+      "learning_rate": 8.36310498901288e-06,
+      "loss": 0.148428475856781,
+      "step": 4070
+    },
+    {
+      "epoch": 0.741448326055313,
+      "grad_norm": 0.16534076631069183,
+      "learning_rate": 8.308234683994415e-06,
+      "loss": 0.14222711324691772,
+      "step": 4075
+    },
+    {
+      "epoch": 0.74235807860262,
+      "grad_norm": 0.17922644317150116,
+      "learning_rate": 8.253509080419198e-06,
+      "loss": 0.14365782737731933,
+      "step": 4080
+    },
+    {
+      "epoch": 0.7432678311499272,
+      "grad_norm": 0.15061035752296448,
+      "learning_rate": 8.198928652705204e-06,
+      "loss": 0.13571925163269044,
+      "step": 4085
+    },
+    {
+      "epoch": 0.7441775836972343,
+      "grad_norm": 0.18075402081012726,
+      "learning_rate": 8.144493874011908e-06,
+      "loss": 0.14385528564453126,
+      "step": 4090
+    },
+    {
+      "epoch": 0.7450873362445415,
+      "grad_norm": 0.16514739394187927,
+      "learning_rate": 8.090205216236135e-06,
+      "loss": 0.14920626878738402,
+      "step": 4095
+    },
+    {
+      "epoch": 0.7459970887918487,
+      "grad_norm": 0.16453702747821808,
+      "learning_rate": 8.03606315000797e-06,
+      "loss": 0.14704222679138185,
+      "step": 4100
+    },
+    {
+      "epoch": 0.7469068413391557,
+      "grad_norm": 0.16719917953014374,
+      "learning_rate": 7.982068144686707e-06,
+      "loss": 0.14722511768341065,
+      "step": 4105
+    },
+    {
+      "epoch": 0.7478165938864629,
+      "grad_norm": 0.18499110639095306,
+      "learning_rate": 7.92822066835677e-06,
+      "loss": 0.1401848554611206,
+      "step": 4110
+    },
+    {
+      "epoch": 0.74872634643377,
+      "grad_norm": 0.17249563336372375,
+      "learning_rate": 7.87452118782363e-06,
+      "loss": 0.15132423639297485,
+      "step": 4115
+    },
+    {
+      "epoch": 0.7496360989810772,
+      "grad_norm": 0.15049682557582855,
+      "learning_rate": 7.8209701686098e-06,
+      "loss": 0.1341150164604187,
+      "step": 4120
+    },
+    {
+      "epoch": 0.7505458515283843,
+      "grad_norm": 0.16892646253108978,
+      "learning_rate": 7.767568074950751e-06,
+      "loss": 0.1466840147972107,
+      "step": 4125
+    },
+    {
+      "epoch": 0.7514556040756915,
+      "grad_norm": 0.17288286983966827,
+      "learning_rate": 7.714315369790942e-06,
+      "loss": 0.13819680213928223,
+      "step": 4130
+    },
+    {
+      "epoch": 0.7523653566229985,
+      "grad_norm": 0.21893996000289917,
+      "learning_rate": 7.661212514779745e-06,
+      "loss": 0.14369510412216185,
+      "step": 4135
+    },
+    {
+      "epoch": 0.7532751091703057,
+      "grad_norm": 0.1674601435661316,
+      "learning_rate": 7.608259970267509e-06,
+      "loss": 0.14810250997543334,
+      "step": 4140
+    },
+    {
+      "epoch": 0.7541848617176128,
+      "grad_norm": 0.15875539183616638,
+      "learning_rate": 7.555458195301526e-06,
+      "loss": 0.14103198051452637,
+      "step": 4145
+    },
+    {
+      "epoch": 0.75509461426492,
+      "grad_norm": 0.19454079866409302,
+      "learning_rate": 7.502807647622037e-06,
+      "loss": 0.13848764896392823,
+      "step": 4150
+    },
+    {
+      "epoch": 0.756004366812227,
+      "grad_norm": 0.1795455813407898,
+      "learning_rate": 7.450308783658341e-06,
+      "loss": 0.14459335803985596,
+      "step": 4155
+    },
+    {
+      "epoch": 0.7569141193595342,
+      "grad_norm": 0.1643362045288086,
+      "learning_rate": 7.397962058524735e-06,
+      "loss": 0.14335378408432006,
+      "step": 4160
+    },
+    {
+      "epoch": 0.7578238719068413,
+      "grad_norm": 0.16362066566944122,
+      "learning_rate": 7.3457679260166475e-06,
+      "loss": 0.14222005605697632,
+      "step": 4165
+    },
+    {
+      "epoch": 0.7587336244541485,
+      "grad_norm": 0.17313003540039062,
+      "learning_rate": 7.293726838606674e-06,
+      "loss": 0.14272255897521974,
+      "step": 4170
+    },
+    {
+      "epoch": 0.7596433770014556,
+      "grad_norm": 0.1809929460287094,
+      "learning_rate": 7.2418392474406405e-06,
+      "loss": 0.14089123010635377,
+      "step": 4175
+    },
+    {
+      "epoch": 0.7605531295487628,
+      "grad_norm": 0.14306005835533142,
+      "learning_rate": 7.19010560233373e-06,
+      "loss": 0.13531534671783446,
+      "step": 4180
+    },
+    {
+      "epoch": 0.7614628820960698,
+      "grad_norm": 0.15525390207767487,
+      "learning_rate": 7.138526351766559e-06,
+      "loss": 0.14340845346450806,
+      "step": 4185
+    },
+    {
+      "epoch": 0.762372634643377,
+      "grad_norm": 0.24478943645954132,
+      "learning_rate": 7.087101942881263e-06,
+      "loss": 0.14744555950164795,
+      "step": 4190
+    },
+    {
+      "epoch": 0.7632823871906841,
+      "grad_norm": 0.31335577368736267,
+      "learning_rate": 7.035832821477711e-06,
+      "loss": 0.1484094500541687,
+      "step": 4195
+    },
+    {
+      "epoch": 0.7641921397379913,
+      "grad_norm": 0.15140366554260254,
+      "learning_rate": 6.984719432009515e-06,
+      "loss": 0.14991614818572999,
+      "step": 4200
+    },
+    {
+      "epoch": 0.7651018922852983,
+      "grad_norm": 0.16125506162643433,
+      "learning_rate": 6.933762217580289e-06,
+      "loss": 0.1408134937286377,
+      "step": 4205
+    },
+    {
+      "epoch": 0.7660116448326055,
+      "grad_norm": 0.2501450181007385,
+      "learning_rate": 6.882961619939726e-06,
+      "loss": 0.13875640630722047,
+      "step": 4210
+    },
+    {
+      "epoch": 0.7669213973799127,
+      "grad_norm": 0.16227811574935913,
+      "learning_rate": 6.8323180794798245e-06,
+      "loss": 0.14138660430908204,
+      "step": 4215
+    },
+    {
+      "epoch": 0.7678311499272198,
+      "grad_norm": 0.16676810383796692,
+      "learning_rate": 6.781832035231053e-06,
+      "loss": 0.14696706533432008,
+      "step": 4220
+    },
+    {
+      "epoch": 0.768740902474527,
+      "grad_norm": 0.14638574421405792,
+      "learning_rate": 6.731503924858518e-06,
+      "loss": 0.14263020753860473,
+      "step": 4225
+    },
+    {
+      "epoch": 0.769650655021834,
+      "grad_norm": 0.17093190550804138,
+      "learning_rate": 6.681334184658211e-06,
+      "loss": 0.14694111347198485,
+      "step": 4230
+    },
+    {
+      "epoch": 0.7705604075691412,
+      "grad_norm": 0.17174287140369415,
+      "learning_rate": 6.631323249553201e-06,
+      "loss": 0.13854929208755493,
+      "step": 4235
+    },
+    {
+      "epoch": 0.7714701601164483,
+      "grad_norm": 0.14599016308784485,
+      "learning_rate": 6.5814715530898745e-06,
+      "loss": 0.14058833122253417,
+      "step": 4240
+    },
+    {
+      "epoch": 0.7723799126637555,
+      "grad_norm": 0.16222265362739563,
+      "learning_rate": 6.531779527434176e-06,
+      "loss": 0.1428326725959778,
+      "step": 4245
+    },
+    {
+      "epoch": 0.7732896652110626,
+      "grad_norm": 0.1741994023323059,
+      "learning_rate": 6.482247603367839e-06,
+      "loss": 0.13985042572021483,
+      "step": 4250
+    },
+    {
+      "epoch": 0.7741994177583698,
+      "grad_norm": 0.17427101731300354,
+      "learning_rate": 6.432876210284688e-06,
+      "loss": 0.1442667603492737,
+      "step": 4255
+    },
+    {
+      "epoch": 0.7751091703056768,
+      "grad_norm": 0.1665259599685669,
+      "learning_rate": 6.383665776186912e-06,
+      "loss": 0.1421986222267151,
+      "step": 4260
+    },
+    {
+      "epoch": 0.776018922852984,
+      "grad_norm": 0.1728232353925705,
+      "learning_rate": 6.334616727681303e-06,
+      "loss": 0.1367053508758545,
+      "step": 4265
+    },
+    {
+      "epoch": 0.7769286754002911,
+      "grad_norm": 0.15882381796836853,
+      "learning_rate": 6.285729489975639e-06,
+      "loss": 0.14551182985305786,
+      "step": 4270
+    },
+    {
+      "epoch": 0.7778384279475983,
+      "grad_norm": 0.242042675614357,
+      "learning_rate": 6.2370044868749115e-06,
+      "loss": 0.1455132007598877,
+      "step": 4275
+    },
+    {
+      "epoch": 0.7787481804949054,
+      "grad_norm": 0.1599501073360443,
+      "learning_rate": 6.188442140777742e-06,
+      "loss": 0.1424942970275879,
+      "step": 4280
+    },
+    {
+      "epoch": 0.7796579330422125,
+      "grad_norm": 0.15182635188102722,
+      "learning_rate": 6.140042872672647e-06,
+      "loss": 0.14212887287139891,
+      "step": 4285
+    },
+    {
+      "epoch": 0.7805676855895196,
+      "grad_norm": 0.1720375418663025,
+      "learning_rate": 6.091807102134403e-06,
+      "loss": 0.14243412017822266,
+      "step": 4290
+    },
+    {
+      "epoch": 0.7814774381368268,
+      "grad_norm": 0.16436047852039337,
+      "learning_rate": 6.043735247320454e-06,
+      "loss": 0.15035657882690429,
+      "step": 4295
+    },
+    {
+      "epoch": 0.7823871906841339,
+      "grad_norm": 0.1498408019542694,
+      "learning_rate": 5.995827724967218e-06,
+      "loss": 0.14494839906692505,
+      "step": 4300
+    },
+    {
+      "epoch": 0.7832969432314411,
+      "grad_norm": 0.16924560070037842,
+      "learning_rate": 5.948084950386535e-06,
+      "loss": 0.13581212759017944,
+      "step": 4305
+    },
+    {
+      "epoch": 0.7842066957787481,
+      "grad_norm": 0.15889139473438263,
+      "learning_rate": 5.900507337462036e-06,
+      "loss": 0.15071530342102052,
+      "step": 4310
+    },
+    {
+      "epoch": 0.7851164483260553,
+      "grad_norm": 0.17201054096221924,
+      "learning_rate": 5.853095298645542e-06,
+      "loss": 0.1398628830909729,
+      "step": 4315
+    },
+    {
+      "epoch": 0.7860262008733624,
+      "grad_norm": 0.17965619266033173,
+      "learning_rate": 5.805849244953548e-06,
+      "loss": 0.14666696786880493,
+      "step": 4320
+    },
+    {
+      "epoch": 0.7869359534206696,
+      "grad_norm": 0.17514032125473022,
+      "learning_rate": 5.758769585963569e-06,
+      "loss": 0.1383386731147766,
+      "step": 4325
+    },
+    {
+      "epoch": 0.7878457059679768,
+      "grad_norm": 0.17497631907463074,
+      "learning_rate": 5.7118567298106744e-06,
+      "loss": 0.14362354278564454,
+      "step": 4330
+    },
+    {
+      "epoch": 0.7887554585152838,
+      "grad_norm": 0.16770458221435547,
+      "learning_rate": 5.665111083183905e-06,
+      "loss": 0.14136618375778198,
+      "step": 4335
+    },
+    {
+      "epoch": 0.789665211062591,
+      "grad_norm": 0.17134106159210205,
+      "learning_rate": 5.618533051322747e-06,
+      "loss": 0.1401529550552368,
+      "step": 4340
+    },
+    {
+      "epoch": 0.7905749636098981,
+      "grad_norm": 0.19458788633346558,
+      "learning_rate": 5.5721230380136435e-06,
+      "loss": 0.1393273115158081,
+      "step": 4345
+    },
+    {
+      "epoch": 0.7914847161572053,
+      "grad_norm": 0.19483692944049835,
+      "learning_rate": 5.525881445586467e-06,
+      "loss": 0.1369825482368469,
+      "step": 4350
+    },
+    {
+      "epoch": 0.7923944687045124,
+      "grad_norm": 0.3052191734313965,
+      "learning_rate": 5.4798086749110495e-06,
+      "loss": 0.14762181043624878,
+      "step": 4355
+    },
+    {
+      "epoch": 0.7933042212518195,
+      "grad_norm": 0.164458766579628,
+      "learning_rate": 5.4339051253937065e-06,
+      "loss": 0.14501686096191407,
+      "step": 4360
+    },
+    {
+      "epoch": 0.7942139737991266,
+      "grad_norm": 0.1719193458557129,
+      "learning_rate": 5.3881711949737625e-06,
+      "loss": 0.13321092128753662,
+      "step": 4365
+    },
+    {
+      "epoch": 0.7951237263464338,
+      "grad_norm": 0.17219696938991547,
+      "learning_rate": 5.342607280120121e-06,
+      "loss": 0.1413906455039978,
+      "step": 4370
+    },
+    {
+      "epoch": 0.7960334788937409,
+      "grad_norm": 0.15083056688308716,
+      "learning_rate": 5.297213775827789e-06,
+      "loss": 0.14772192239761353,
+      "step": 4375
+    },
+    {
+      "epoch": 0.7969432314410481,
+      "grad_norm": 0.1699071079492569,
+      "learning_rate": 5.251991075614507e-06,
+      "loss": 0.1392375946044922,
+      "step": 4380
+    },
+    {
+      "epoch": 0.7978529839883551,
+      "grad_norm": 0.1680395007133484,
+      "learning_rate": 5.206939571517302e-06,
+      "loss": 0.14185575246810914,
+      "step": 4385
+    },
+    {
+      "epoch": 0.7987627365356623,
+      "grad_norm": 0.16526710987091064,
+      "learning_rate": 5.162059654089083e-06,
+      "loss": 0.15001428127288818,
+      "step": 4390
+    },
+    {
+      "epoch": 0.7996724890829694,
+      "grad_norm": 0.16281752288341522,
+      "learning_rate": 5.1173517123952794e-06,
+      "loss": 0.13747023344039916,
+      "step": 4395
+    },
+    {
+      "epoch": 0.8005822416302766,
+      "grad_norm": 0.1454378366470337,
+      "learning_rate": 5.072816134010458e-06,
+      "loss": 0.14710829257965088,
+      "step": 4400
+    },
+    {
+      "epoch": 0.8014919941775837,
+      "grad_norm": 0.16565890610218048,
+      "learning_rate": 5.028453305014966e-06,
+      "loss": 0.14138611555099487,
+      "step": 4405
+    },
+    {
+      "epoch": 0.8024017467248908,
+      "grad_norm": 0.1962810605764389,
+      "learning_rate": 4.984263609991577e-06,
+      "loss": 0.13836177587509155,
+      "step": 4410
+    },
+    {
+      "epoch": 0.8033114992721979,
+      "grad_norm": 0.16091369092464447,
+      "learning_rate": 4.940247432022149e-06,
+      "loss": 0.14407440423965454,
+      "step": 4415
+    },
+    {
+      "epoch": 0.8042212518195051,
+      "grad_norm": 0.1930241584777832,
+      "learning_rate": 4.89640515268433e-06,
+      "loss": 0.14346336126327514,
+      "step": 4420
+    },
+    {
+      "epoch": 0.8051310043668122,
+      "grad_norm": 0.19301500916481018,
+      "learning_rate": 4.852737152048242e-06,
+      "loss": 0.14174317121505736,
+      "step": 4425
+    },
+    {
+      "epoch": 0.8060407569141194,
+      "grad_norm": 0.1541353315114975,
+      "learning_rate": 4.80924380867315e-06,
+      "loss": 0.14100592136383056,
+      "step": 4430
+    },
+    {
+      "epoch": 0.8069505094614265,
+      "grad_norm": 0.16285750269889832,
+      "learning_rate": 4.765925499604243e-06,
+      "loss": 0.1441288709640503,
+      "step": 4435
+    },
+    {
+      "epoch": 0.8078602620087336,
+      "grad_norm": 0.17382675409317017,
+      "learning_rate": 4.722782600369299e-06,
+      "loss": 0.13763951063156127,
+      "step": 4440
+    },
+    {
+      "epoch": 0.8087700145560408,
+      "grad_norm": 0.1697344034910202,
+      "learning_rate": 4.679815484975505e-06,
+      "loss": 0.1410105347633362,
+      "step": 4445
+    },
+    {
+      "epoch": 0.8096797671033479,
+      "grad_norm": 0.19964542984962463,
+      "learning_rate": 4.637024525906131e-06,
+      "loss": 0.1439276695251465,
+      "step": 4450
+    },
+    {
+      "epoch": 0.8105895196506551,
+      "grad_norm": 0.165307879447937,
+      "learning_rate": 4.59441009411736e-06,
+      "loss": 0.13897504806518554,
+      "step": 4455
+    },
+    {
+      "epoch": 0.8114992721979621,
+      "grad_norm": 0.16687989234924316,
+      "learning_rate": 4.551972559035067e-06,
+      "loss": 0.1422593355178833,
+      "step": 4460
+    },
+    {
+      "epoch": 0.8124090247452693,
+      "grad_norm": 0.15737789869308472,
+      "learning_rate": 4.509712288551571e-06,
+      "loss": 0.1452128052711487,
+      "step": 4465
+    },
+    {
+      "epoch": 0.8133187772925764,
+      "grad_norm": 0.17116659879684448,
+      "learning_rate": 4.467629649022509e-06,
+      "loss": 0.14385371208190917,
+      "step": 4470
+    },
+    {
+      "epoch": 0.8142285298398836,
+      "grad_norm": 0.17457640171051025,
+      "learning_rate": 4.425725005263623e-06,
+      "loss": 0.14808475971221924,
+      "step": 4475
+    },
+    {
+      "epoch": 0.8151382823871907,
+      "grad_norm": 0.1621970385313034,
+      "learning_rate": 4.383998720547583e-06,
+      "loss": 0.13927959203720092,
+      "step": 4480
+    },
+    {
+      "epoch": 0.8160480349344978,
+      "grad_norm": 0.176296666264534,
+      "learning_rate": 4.342451156600896e-06,
+      "loss": 0.15041060447692872,
+      "step": 4485
+    },
+    {
+      "epoch": 0.8169577874818049,
+      "grad_norm": 0.17157645523548126,
+      "learning_rate": 4.301082673600698e-06,
+      "loss": 0.13932652473449708,
+      "step": 4490
+    },
+    {
+      "epoch": 0.8178675400291121,
+      "grad_norm": 0.15378527343273163,
+      "learning_rate": 4.259893630171682e-06,
+      "loss": 0.1406856894493103,
+      "step": 4495
+    },
+    {
+      "epoch": 0.8187772925764192,
+      "grad_norm": 0.1750226765871048,
+      "learning_rate": 4.218884383382987e-06,
+      "loss": 0.1350164532661438,
+      "step": 4500
+    },
+    {
+      "epoch": 0.8196870451237264,
+      "grad_norm": 0.1393742561340332,
+      "learning_rate": 4.178055288745053e-06,
+      "loss": 0.13769235610961914,
+      "step": 4505
+    },
+    {
+      "epoch": 0.8205967976710334,
+      "grad_norm": 0.1668994128704071,
+      "learning_rate": 4.137406700206617e-06,
+      "loss": 0.14029752016067504,
+      "step": 4510
+    },
+    {
+      "epoch": 0.8215065502183406,
+      "grad_norm": 0.1833454668521881,
+      "learning_rate": 4.0969389701515675e-06,
+      "loss": 0.14276301860809326,
+      "step": 4515
+    },
+    {
+      "epoch": 0.8224163027656477,
+      "grad_norm": 0.16187874972820282,
+      "learning_rate": 4.056652449395945e-06,
+      "loss": 0.1444832682609558,
+      "step": 4520
+    },
+    {
+      "epoch": 0.8233260553129549,
+      "grad_norm": 0.1453280746936798,
+      "learning_rate": 4.01654748718488e-06,
+      "loss": 0.14512733221054078,
+      "step": 4525
+    },
+    {
+      "epoch": 0.824235807860262,
+      "grad_norm": 0.1782725751399994,
+      "learning_rate": 3.976624431189563e-06,
+      "loss": 0.14093561172485353,
+      "step": 4530
+    },
+    {
+      "epoch": 0.8251455604075691,
+      "grad_norm": 0.17374491691589355,
+      "learning_rate": 3.936883627504234e-06,
+      "loss": 0.14031401872634888,
+      "step": 4535
+    },
+    {
+      "epoch": 0.8260553129548762,
+      "grad_norm": 0.1609172821044922,
+      "learning_rate": 3.897325420643174e-06,
+      "loss": 0.1428336262702942,
+      "step": 4540
+    },
+    {
+      "epoch": 0.8269650655021834,
+      "grad_norm": 0.1520884931087494,
+      "learning_rate": 3.85795015353774e-06,
+      "loss": 0.1460547924041748,
+      "step": 4545
+    },
+    {
+      "epoch": 0.8278748180494906,
+      "grad_norm": 0.20986326038837433,
+      "learning_rate": 3.818758167533376e-06,
+      "loss": 0.14706350564956666,
+      "step": 4550
+    },
+    {
+      "epoch": 0.8287845705967977,
+      "grad_norm": 0.16825413703918457,
+      "learning_rate": 3.7797498023866396e-06,
+      "loss": 0.14507200717926025,
+      "step": 4555
+    },
+    {
+      "epoch": 0.8296943231441049,
+      "grad_norm": 0.16758380830287933,
+      "learning_rate": 3.740925396262296e-06,
+      "loss": 0.14898381233215333,
+      "step": 4560
+    },
+    {
+      "epoch": 0.8306040756914119,
+      "grad_norm": 0.15207453072071075,
+      "learning_rate": 3.7022852857303503e-06,
+      "loss": 0.14138854742050172,
+      "step": 4565
+    },
+    {
+      "epoch": 0.8315138282387191,
+      "grad_norm": 0.15150749683380127,
+      "learning_rate": 3.66382980576315e-06,
+      "loss": 0.13894975185394287,
+      "step": 4570
+    },
+    {
+      "epoch": 0.8324235807860262,
+      "grad_norm": 0.17071188986301422,
+      "learning_rate": 3.625559289732472e-06,
+      "loss": 0.14072470664978026,
+      "step": 4575
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.154335618019104,
+      "learning_rate": 3.5874740694066294e-06,
+      "loss": 0.13791344165802003,
+      "step": 4580
+    },
+    {
+      "epoch": 0.8342430858806404,
+      "grad_norm": 0.14017128944396973,
+      "learning_rate": 3.5495744749476116e-06,
+      "loss": 0.14427922964096068,
+      "step": 4585
+    },
+    {
+      "epoch": 0.8351528384279476,
+      "grad_norm": 0.17210033535957336,
+      "learning_rate": 3.5118608349081983e-06,
+      "loss": 0.15191166400909423,
+      "step": 4590
+    },
+    {
+      "epoch": 0.8360625909752547,
+      "grad_norm": 0.18715685606002808,
+      "learning_rate": 3.4743334762291358e-06,
+      "loss": 0.14451316595077515,
+      "step": 4595
+    },
+    {
+      "epoch": 0.8369723435225619,
+      "grad_norm": 0.18079884350299835,
+      "learning_rate": 3.436992724236293e-06,
+      "loss": 0.13530746698379517,
+      "step": 4600
+    },
+    {
+      "epoch": 0.837882096069869,
+      "grad_norm": 0.13519920408725739,
+      "learning_rate": 3.399838902637817e-06,
+      "loss": 0.1477964401245117,
+      "step": 4605
+    },
+    {
+      "epoch": 0.8387918486171762,
+      "grad_norm": 0.1778026670217514,
+      "learning_rate": 3.3628723335213885e-06,
+      "loss": 0.14419831037521363,
+      "step": 4610
+    },
+    {
+      "epoch": 0.8397016011644832,
+      "grad_norm": 0.15165366232395172,
+      "learning_rate": 3.326093337351355e-06,
+      "loss": 0.13888469934463502,
+      "step": 4615
+    },
+    {
+      "epoch": 0.8406113537117904,
+      "grad_norm": 0.17049473524093628,
+      "learning_rate": 3.2895022329660018e-06,
+      "loss": 0.14438477754592896,
+      "step": 4620
+    },
+    {
+      "epoch": 0.8415211062590975,
+      "grad_norm": 0.16536414623260498,
+      "learning_rate": 3.2530993375747833e-06,
+      "loss": 0.1444351315498352,
+      "step": 4625
+    },
+    {
+      "epoch": 0.8424308588064047,
+      "grad_norm": 0.17570015788078308,
+      "learning_rate": 3.2168849667555402e-06,
+      "loss": 0.13861945867538453,
+      "step": 4630
+    },
+    {
+      "epoch": 0.8433406113537117,
+      "grad_norm": 0.1699545532464981,
+      "learning_rate": 3.1808594344518132e-06,
+      "loss": 0.13902754783630372,
+      "step": 4635
+    },
+    {
+      "epoch": 0.8442503639010189,
+      "grad_norm": 0.12331254780292511,
+      "learning_rate": 3.1450230529700837e-06,
+      "loss": 0.14104254245758058,
+      "step": 4640
+    },
+    {
+      "epoch": 0.845160116448326,
+      "grad_norm": 0.1508190929889679,
+      "learning_rate": 3.1093761329770708e-06,
+      "loss": 0.13288766145706177,
+      "step": 4645
+    },
+    {
+      "epoch": 0.8460698689956332,
+      "grad_norm": 0.19049489498138428,
+      "learning_rate": 3.0739189834970735e-06,
+      "loss": 0.14914840459823608,
+      "step": 4650
+    },
+    {
+      "epoch": 0.8469796215429404,
+      "grad_norm": 0.1662369966506958,
+      "learning_rate": 3.0386519119092293e-06,
+      "loss": 0.14222898483276367,
+      "step": 4655
+    },
+    {
+      "epoch": 0.8478893740902474,
+      "grad_norm": 0.18985967338085175,
+      "learning_rate": 3.0035752239449126e-06,
+      "loss": 0.14431113004684448,
+      "step": 4660
+    },
+    {
+      "epoch": 0.8487991266375546,
+      "grad_norm": 0.17005261778831482,
+      "learning_rate": 2.9686892236850337e-06,
+      "loss": 0.14140807390213012,
+      "step": 4665
+    },
+    {
+      "epoch": 0.8497088791848617,
+      "grad_norm": 0.16786684095859528,
+      "learning_rate": 2.9339942135574394e-06,
+      "loss": 0.14161460399627684,
+      "step": 4670
+    },
+    {
+      "epoch": 0.8506186317321689,
+      "grad_norm": 0.16358181834220886,
+      "learning_rate": 2.899490494334281e-06,
+      "loss": 0.14674670696258546,
+      "step": 4675
+    },
+    {
+      "epoch": 0.851528384279476,
+      "grad_norm": 0.1651349812746048,
+      "learning_rate": 2.8651783651293867e-06,
+      "loss": 0.13794611692428588,
+      "step": 4680
+    },
+    {
+      "epoch": 0.8524381368267832,
+      "grad_norm": 0.16934923827648163,
+      "learning_rate": 2.831058123395694e-06,
+      "loss": 0.13199397325515747,
+      "step": 4685
+    },
+    {
+      "epoch": 0.8533478893740902,
+      "grad_norm": 0.1704150140285492,
+      "learning_rate": 2.797130064922665e-06,
+      "loss": 0.14044904708862305,
+      "step": 4690
+    },
+    {
+      "epoch": 0.8542576419213974,
+      "grad_norm": 0.1814192682504654,
+      "learning_rate": 2.7633944838337143e-06,
+      "loss": 0.1465100646018982,
+      "step": 4695
+    },
+    {
+      "epoch": 0.8551673944687045,
+      "grad_norm": 0.18942610919475555,
+      "learning_rate": 2.729851672583669e-06,
+      "loss": 0.14685982465744019,
+      "step": 4700
+    },
+    {
+      "epoch": 0.8560771470160117,
+      "grad_norm": 0.17895208299160004,
+      "learning_rate": 2.6965019219562155e-06,
+      "loss": 0.13971571922302245,
+      "step": 4705
+    },
+    {
+      "epoch": 0.8569868995633187,
+      "grad_norm": 0.22735828161239624,
+      "learning_rate": 2.6633455210614055e-06,
+      "loss": 0.13776102066040039,
+      "step": 4710
+    },
+    {
+      "epoch": 0.8578966521106259,
+      "grad_norm": 0.16779793798923492,
+      "learning_rate": 2.630382757333133e-06,
+      "loss": 0.14134042263031005,
+      "step": 4715
+    },
+    {
+      "epoch": 0.858806404657933,
+      "grad_norm": 0.2148888260126114,
+      "learning_rate": 2.597613916526637e-06,
+      "loss": 0.14680721759796142,
+      "step": 4720
+    },
+    {
+      "epoch": 0.8597161572052402,
+      "grad_norm": 0.16560257971286774,
+      "learning_rate": 2.565039282716045e-06,
+      "loss": 0.14137234687805175,
+      "step": 4725
+    },
+    {
+      "epoch": 0.8606259097525473,
+      "grad_norm": 0.16197068989276886,
+      "learning_rate": 2.532659138291879e-06,
+      "loss": 0.14969314336776735,
+      "step": 4730
+    },
+    {
+      "epoch": 0.8615356622998545,
+      "grad_norm": 0.14650246500968933,
+      "learning_rate": 2.5004737639586497e-06,
+      "loss": 0.13532910346984864,
+      "step": 4735
+    },
+    {
+      "epoch": 0.8624454148471615,
+      "grad_norm": 0.1565634310245514,
+      "learning_rate": 2.4684834387323943e-06,
+      "loss": 0.14146244525909424,
+      "step": 4740
+    },
+    {
+      "epoch": 0.8633551673944687,
+      "grad_norm": 0.18060864508152008,
+      "learning_rate": 2.4366884399382393e-06,
+      "loss": 0.14218534231185914,
+      "step": 4745
+    },
+    {
+      "epoch": 0.8642649199417758,
+      "grad_norm": 0.24613255262374878,
+      "learning_rate": 2.4050890432080557e-06,
+      "loss": 0.13907679319381713,
+      "step": 4750
+    },
+    {
+      "epoch": 0.865174672489083,
+      "grad_norm": 0.16036023199558258,
+      "learning_rate": 2.3736855224780057e-06,
+      "loss": 0.13718113899230958,
+      "step": 4755
+    },
+    {
+      "epoch": 0.86608442503639,
+      "grad_norm": 0.16678516566753387,
+      "learning_rate": 2.3424781499862075e-06,
+      "loss": 0.1327962040901184,
+      "step": 4760
+    },
+    {
+      "epoch": 0.8669941775836972,
+      "grad_norm": 0.1763770878314972,
+      "learning_rate": 2.3114671962703727e-06,
+      "loss": 0.14390318393707274,
+      "step": 4765
+    },
+    {
+      "epoch": 0.8679039301310044,
+      "grad_norm": 0.17735697329044342,
+      "learning_rate": 2.280652930165428e-06,
+      "loss": 0.15223288536071777,
+      "step": 4770
+    },
+    {
+      "epoch": 0.8688136826783115,
+      "grad_norm": 0.15827041864395142,
+      "learning_rate": 2.250035618801241e-06,
+      "loss": 0.14296332597732545,
+      "step": 4775
+    },
+    {
+      "epoch": 0.8697234352256187,
+      "grad_norm": 0.16876135766506195,
+      "learning_rate": 2.219615527600244e-06,
+      "loss": 0.1359076738357544,
+      "step": 4780
+    },
+    {
+      "epoch": 0.8706331877729258,
+      "grad_norm": 0.1800110638141632,
+      "learning_rate": 2.189392920275174e-06,
+      "loss": 0.1424281358718872,
+      "step": 4785
+    },
+    {
+      "epoch": 0.8715429403202329,
+      "grad_norm": 0.1409560889005661,
+      "learning_rate": 2.159368058826783e-06,
+      "loss": 0.14480490684509278,
+      "step": 4790
+    },
+    {
+      "epoch": 0.87245269286754,
+      "grad_norm": 0.1634288728237152,
+      "learning_rate": 2.129541203541535e-06,
+      "loss": 0.14513269662857056,
+      "step": 4795
+    },
+    {
+      "epoch": 0.8733624454148472,
+      "grad_norm": 0.17126062512397766,
+      "learning_rate": 2.099912612989391e-06,
+      "loss": 0.13546934127807617,
+      "step": 4800
+    },
+    {
+      "epoch": 0.8742721979621543,
+      "grad_norm": 0.16704080998897552,
+      "learning_rate": 2.0704825440215457e-06,
+      "loss": 0.13852492570877076,
+      "step": 4805
+    },
+    {
+      "epoch": 0.8751819505094615,
+      "grad_norm": 0.1725970208644867,
+      "learning_rate": 2.0412512517681946e-06,
+      "loss": 0.14504197835922242,
+      "step": 4810
+    },
+    {
+      "epoch": 0.8760917030567685,
+      "grad_norm": 0.1700201779603958,
+      "learning_rate": 2.0122189896363387e-06,
+      "loss": 0.14312338829040527,
+      "step": 4815
+    },
+    {
+      "epoch": 0.8770014556040757,
+      "grad_norm": 0.16491736471652985,
+      "learning_rate": 1.9833860093075834e-06,
+      "loss": 0.14062976837158203,
+      "step": 4820
+    },
+    {
+      "epoch": 0.8779112081513828,
+      "grad_norm": 0.13748787343502045,
+      "learning_rate": 1.9547525607359537e-06,
+      "loss": 0.1346171498298645,
+      "step": 4825
+    },
+    {
+      "epoch": 0.87882096069869,
+      "grad_norm": 0.16399399936199188,
+      "learning_rate": 1.926318892145712e-06,
+      "loss": 0.14178123474121093,
+      "step": 4830
+    },
+    {
+      "epoch": 0.879730713245997,
+      "grad_norm": 0.14491963386535645,
+      "learning_rate": 1.8980852500292412e-06,
+      "loss": 0.1408564567565918,
+      "step": 4835
+    },
+    {
+      "epoch": 0.8806404657933042,
+      "grad_norm": 0.17335423827171326,
+      "learning_rate": 1.8700518791448851e-06,
+      "loss": 0.14403265714645386,
+      "step": 4840
+    },
+    {
+      "epoch": 0.8815502183406113,
+      "grad_norm": 0.17399625480175018,
+      "learning_rate": 1.8422190225148155e-06,
+      "loss": 0.14289036989212037,
+      "step": 4845
+    },
+    {
+      "epoch": 0.8824599708879185,
+      "grad_norm": 0.17945612967014313,
+      "learning_rate": 1.814586921422956e-06,
+      "loss": 0.14494109153747559,
+      "step": 4850
+    },
+    {
+      "epoch": 0.8833697234352256,
+      "grad_norm": 0.1910620480775833,
+      "learning_rate": 1.7871558154128664e-06,
+      "loss": 0.13726245164871215,
+      "step": 4855
+    },
+    {
+      "epoch": 0.8842794759825328,
+      "grad_norm": 0.1771879345178604,
+      "learning_rate": 1.7599259422856756e-06,
+      "loss": 0.1464752197265625,
+      "step": 4860
+    },
+    {
+      "epoch": 0.8851892285298398,
+      "grad_norm": 0.19427461922168732,
+      "learning_rate": 1.7328975380980218e-06,
+      "loss": 0.13823356628417968,
+      "step": 4865
+    },
+    {
+      "epoch": 0.886098981077147,
+      "grad_norm": 0.1491149365901947,
+      "learning_rate": 1.7060708371599897e-06,
+      "loss": 0.1338604211807251,
+      "step": 4870
+    },
+    {
+      "epoch": 0.8870087336244541,
+      "grad_norm": 0.16087733209133148,
+      "learning_rate": 1.6794460720331057e-06,
+      "loss": 0.14184389114379883,
+      "step": 4875
+    },
+    {
+      "epoch": 0.8879184861717613,
+      "grad_norm": 0.14506325125694275,
+      "learning_rate": 1.653023473528309e-06,
+      "loss": 0.14267687797546386,
+      "step": 4880
+    },
+    {
+      "epoch": 0.8888282387190685,
+      "grad_norm": 0.16886365413665771,
+      "learning_rate": 1.626803270703936e-06,
+      "loss": 0.14266083240509034,
+      "step": 4885
+    },
+    {
+      "epoch": 0.8897379912663755,
+      "grad_norm": 0.1891999989748001,
+      "learning_rate": 1.6007856908637652e-06,
+      "loss": 0.1398016929626465,
+      "step": 4890
+    },
+    {
+      "epoch": 0.8906477438136827,
+      "grad_norm": 0.17645299434661865,
+      "learning_rate": 1.5749709595550083e-06,
+      "loss": 0.13869571685791016,
+      "step": 4895
+    },
+    {
+      "epoch": 0.8915574963609898,
+      "grad_norm": 0.17714262008666992,
+      "learning_rate": 1.549359300566408e-06,
+      "loss": 0.14957486391067504,
+      "step": 4900
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.694709851825631e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-4900/training_args.bin b/checkpoint-4900/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-4900/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-500/README.md b/checkpoint-500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-500/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for gemma-4-E4B-it LoRA adapter (checkpoint-500)
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** unsloth/gemma-4-E4B-it
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-500/adapter_config.json b/checkpoint-500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-500/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-500/adapter_model.safetensors b/checkpoint-500/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..57f073f55be4d5b6476317a2dd5c9d7c012c731e
--- /dev/null
+++ b/checkpoint-500/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e72da5e72729a82ed1e74cb4530ea283258fbc2e6a672f1720f78e35833fc693
+size 169741912
diff --git a/checkpoint-500/chat_template.jinja b/checkpoint-500/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-500/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders every entry of `properties` (sorted by key) as a comma-separated `key:{...}` item; when filter_keys is true, keys listed in standard_keys are skipped. NOTE(review): the `required` parameter is accepted but never read in this macro body. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}{#- schema keywords that are skipped as property names when filter_keys is true -#}
+    {%- set ns = namespace(found_first=false) -%}{#- becomes true once one entry was emitted, so later entries are preceded by a comma -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- tracks whether a field was already written inside the current entry -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- OBJECT without a 'properties' mapping: the value's own keys are rendered as properties, with schema keywords filtered out (filter_keys=true) -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- `type` is always written last; the same line also closes this entry's brace -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool as `declaration:NAME{description:...}` with optional `,parameters:{...}` and `,response:{...}` sections read from tool_data['function']. NOTE(review): the brace closing `parameters` is emitted only inside the params['type'] branch, and the one closing `response` only when its type is OBJECT; confirm callers always supply those. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serializes a value recursively: strings are wrapped in <|"|> markers, booleans become true/false, mappings become `{k:v,...}` sorted by key, other sequences become `[a,b,...]`, and anything else is printed as-is. escape_keys decides whether mapping keys are wrapped in <|"|> too; it is passed down unchanged to nested values. -#}
+    {%- if argument is string -%}{#- the string test comes before the sequence test below, so strings never take the list branch -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Returns `text` with thinking-channel spans removed: the text is split on '<channel|>', and for every piece containing '<|channel>' only the part before its first '<|channel>' is kept; the kept pieces are concatenated and the result is trimmed. -#}
+    {%- set ns = namespace(result='') -%}{#- a namespace is used so the accumulated string survives the for-loop scope -#}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emits one `<|tool_response>response:NAME{...}<tool_response|>` block. A mapping response is written as key:value pairs sorted by key; any other response is wrapped as `{value:...}`. Values go through format_argument with escape_keys=False. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type records the kind of the last emitted piece; it is read when a turn is closed and when the generation prompt is decided -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block: a system turn is opened when thinking is enabled, tools are supplied, or the first message has role system/developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the first message was rendered here, so the main loop starts at the second one -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard; the index refers to loop_messages and stays -1 when it contains no user message -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages: every non-tool message opens a turn or continues the previous model turn -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}{#- role:tool messages get no turn of their own; they are rendered by the forward scan of the assistant message that carries tool_calls -#}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel, only for messages after the last user message that also carry tool_calls -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag becomes true once at least one tool response block was emitted for this message -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- tool calls without any rendered response: leave a tool_response block open instead of closing the turn; otherwise the turn is closed unless tool responses were emitted and the message has no content -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- open a model turn for generation, except when the last rendered piece was a tool call or a tool response -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-500/optimizer.pt b/checkpoint-500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f1c23368da91ccb321cdb11a33822f5ce9681e88
--- /dev/null
+++ b/checkpoint-500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57690ba6db6b1c1317be364e116f785706400e214fc68f6bac0f5f05a58f044a
+size 72807355
diff --git a/checkpoint-500/processor_config.json b/checkpoint-500/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-500/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-500/rng_state.pth b/checkpoint-500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-500/scheduler.pt b/checkpoint-500/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..dedb3acf3e322ab265844313e4843a1c87c685f5
--- /dev/null
+++ b/checkpoint-500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c96ac0a9f378033255ff6badf107ce844a8d4aa8acf7d0f966846c207b52eaef
+size 1465
diff --git a/checkpoint-500/tokenizer.json b/checkpoint-500/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-500/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-500/tokenizer_config.json b/checkpoint-500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-500/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-500/trainer_state.json b/checkpoint-500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..84a830d845225c501fec2be7ed3ca813c30edc50
--- /dev/null
+++ b/checkpoint-500/trainer_state.json
@@ -0,0 +1,742 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.09097525473071325,
+  "eval_steps": 100,
+  "global_step": 500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.8430480830875264e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-500/training_args.bin b/checkpoint-500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-5000/README.md b/checkpoint-5000/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-5000/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-5000/adapter_config.json b/checkpoint-5000/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-5000/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-5000/adapter_model.safetensors b/checkpoint-5000/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..49e258125bc310d7c820342ac761d1fe8b70b818
--- /dev/null
+++ b/checkpoint-5000/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:062304accc1e0e63859762076339d9b10c027abb482653c443866239fa7d30c9
+size 169741912
diff --git a/checkpoint-5000/chat_template.jinja b/checkpoint-5000/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-5000/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders every entry of properties, sorted by key, as one key:{..} entry each, joined by commas. An entry may carry description, enum (STRING), items (ARRAY), nullable, nested properties/required (OBJECT) and always ends with type. With filter_keys=true, keys named in standard_keys are skipped. The required argument is accepted but not read inside this macro. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}{#- namespace object so the flag set inside the loop is still visible on the next iteration -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- no usable 'properties' mapping: render the value's own keys instead, skipping those in standard_keys -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- type is always written last; the brace after it closes this entry -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool as declaration:NAME with its description, optional parameters block (properties, required, type) and optional response block (description, type). The parameters and response braces are closed unconditionally so the declaration stays balanced when params has no 'type' or the response type is not OBJECT; output is unchanged when the type is present. A trailing comma may still precede the closing brace in that fallback case. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>
+        {%- endif -%}
+    }{#- closes the parameters block whether or not a type was written -#}{%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>
+        {%- endif -%}
+    }{#- closes the response block whether or not a type was written -#}{%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serializes one value recursively: strings are wrapped in the <|"|> quote token, booleans become true/false, mappings become a brace-delimited key:value list sorted by key (keys quoted only when escape_keys is true), sequences become a bracketed list, and anything else is printed as-is. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}{#- strings and mappings also pass Jinja's sequence test, so this branch must come after the string and mapping branches -#}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Removes thinking-channel spans from text: splits on '<channel|>', keeps from each piece only what precedes its first '<|channel>' (or the whole piece when it has none), concatenates the kept parts and trims the result. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Wraps one tool result between '<|tool_response>' and '<tool_response|>'. A mapping response is written as response:NAME followed by its key:value pairs sorted by key with unquoted keys; any other response is written under a single 'value' key. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type records what was rendered last; it is read later to pick the turn terminator and to decide whether to open a model turn -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- System turn: emitted when thinking is enabled, tools are supplied, or the first message has role system/developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the first message was rendered here, so the main loop starts after it -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: record the index within loop_messages of the last user message (-1 when there is none); reasoning is only rendered for messages positioned after it -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages; role:tool messages are skipped here and rendered only by the forward-scan of an assistant message that carries tool_calls -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: walk backwards to the nearest non-tool message; if it was also an assistant message, suppress a duplicate <|turn>model header -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as a thinking channel, only for messages that carry tool_calls and come after the last user message -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag becomes true once at least one tool response has been rendered for this message -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan the role:tool messages that directly follow, stopping at the first non-tool message -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name; falls back to the tool message's own name, then 'unknown' -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array (only parts of type text are concatenated) -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- tool calls were rendered but no responses: emit an opening tool-response marker instead of closing the turn -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}{#- close the turn unless tool responses were rendered and no text content followed them -#}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- open a new model turn for generation, except when the last thing rendered was a tool call or tool response -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-5000/optimizer.pt b/checkpoint-5000/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a970be0b82ebb489cb30cebae9385376466a3534
--- /dev/null
+++ b/checkpoint-5000/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf925baf611c389d14cd98bfb1ec7673cb616dc7356b485848d57d356399989a
+size 72807355
diff --git a/checkpoint-5000/processor_config.json b/checkpoint-5000/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-5000/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-5000/rng_state.pth b/checkpoint-5000/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-5000/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-5000/scheduler.pt b/checkpoint-5000/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f0660753bd7e780deec81470e1025d35fa4b1dd8
--- /dev/null
+++ b/checkpoint-5000/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc5e6b6ccc6bc308c743e552cb17737cf764433750ecaa57b00cf76b0e4a1c85
+size 1465
diff --git a/checkpoint-5000/tokenizer.json b/checkpoint-5000/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-5000/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-5000/tokenizer_config.json b/checkpoint-5000/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-5000/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-5000/trainer_state.json b/checkpoint-5000/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e750a5fad76573f641e0468ee9006c9af08f7ba
--- /dev/null
+++ b/checkpoint-5000/trainer_state.json
@@ -0,0 +1,7042 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9097525473071325,
+  "eval_steps": 100,
+  "global_step": 5000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6377365356622998,
+      "grad_norm": 0.17014239728450775,
+      "learning_rate": 1.5370045130657366e-05,
+      "loss": 0.14694437980651856,
+      "step": 3505
+    },
+    {
+      "epoch": 0.638646288209607,
+      "grad_norm": 0.14744038879871368,
+      "learning_rate": 1.5302158945041838e-05,
+      "loss": 0.14434736967086792,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6395560407569141,
+      "grad_norm": 0.2069770246744156,
+      "learning_rate": 1.523435683031818e-05,
+      "loss": 0.13982917070388795,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6404657933042213,
+      "grad_norm": 0.17811502516269684,
+      "learning_rate": 1.5166639374265063e-05,
+      "loss": 0.1408839702606201,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6413755458515283,
+      "grad_norm": 0.165786474943161,
+      "learning_rate": 1.509900716392728e-05,
+      "loss": 0.15312877893447877,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6422852983988355,
+      "grad_norm": 0.1633884161710739,
+      "learning_rate": 1.5031460785610596e-05,
+      "loss": 0.1488795518875122,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6431950509461426,
+      "grad_norm": 0.16498984396457672,
+      "learning_rate": 1.4964000824876723e-05,
+      "loss": 0.15031465291976928,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6441048034934498,
+      "grad_norm": 0.18043678998947144,
+      "learning_rate": 1.4896627866538191e-05,
+      "loss": 0.147829806804657,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6450145560407569,
+      "grad_norm": 0.16813597083091736,
+      "learning_rate": 1.4829342494653315e-05,
+      "loss": 0.1418998956680298,
+      "step": 3545
+    },
+    {
+      "epoch": 0.645924308588064,
+      "grad_norm": 0.1817242056131363,
+      "learning_rate": 1.4762145292521118e-05,
+      "loss": 0.14508869647979736,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6468340611353712,
+      "grad_norm": 0.14666494727134705,
+      "learning_rate": 1.469503684267628e-05,
+      "loss": 0.14159854650497436,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6477438136826783,
+      "grad_norm": 0.16485381126403809,
+      "learning_rate": 1.4628017726884086e-05,
+      "loss": 0.14419105052947997,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6486535662299855,
+      "grad_norm": 0.16100342571735382,
+      "learning_rate": 1.4561088526135375e-05,
+      "loss": 0.14501721858978273,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6495633187772926,
+      "grad_norm": 0.16996590793132782,
+      "learning_rate": 1.4494249820641493e-05,
+      "loss": 0.1377166509628296,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6504730713245997,
+      "grad_norm": 0.16168837249279022,
+      "learning_rate": 1.4427502189829339e-05,
+      "loss": 0.1414325475692749,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6513828238719068,
+      "grad_norm": 0.16318906843662262,
+      "learning_rate": 1.436084621233621e-05,
+      "loss": 0.14685193300247193,
+      "step": 3580
+    },
+    {
+      "epoch": 0.652292576419214,
+      "grad_norm": 0.1636219322681427,
+      "learning_rate": 1.4294282466004899e-05,
+      "loss": 0.1405899167060852,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6532023289665211,
+      "grad_norm": 0.1838461309671402,
+      "learning_rate": 1.422781152787865e-05,
+      "loss": 0.14386332035064697,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6541120815138283,
+      "grad_norm": 0.1796344667673111,
+      "learning_rate": 1.4161433974196115e-05,
+      "loss": 0.1513024687767029,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6550218340611353,
+      "grad_norm": 0.16424529254436493,
+      "learning_rate": 1.4095150380386427e-05,
+      "loss": 0.14238927364349366,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6559315866084425,
+      "grad_norm": 0.19264160096645355,
+      "learning_rate": 1.402896132106415e-05,
+      "loss": 0.14297477006912232,
+      "step": 3605
+    },
+    {
+      "epoch": 0.6568413391557496,
+      "grad_norm": 0.18319948017597198,
+      "learning_rate": 1.3962867370024347e-05,
+      "loss": 0.1448880434036255,
+      "step": 3610
+    },
+    {
+      "epoch": 0.6577510917030568,
+      "grad_norm": 0.16507290303707123,
+      "learning_rate": 1.389686910023758e-05,
+      "loss": 0.14724698066711425,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6586608442503639,
+      "grad_norm": 0.17871244251728058,
+      "learning_rate": 1.3830967083844942e-05,
+      "loss": 0.14479386806488037,
+      "step": 3620
+    },
+    {
+      "epoch": 0.659570596797671,
+      "grad_norm": 0.1846228390932083,
+      "learning_rate": 1.3765161892153112e-05,
+      "loss": 0.1453616738319397,
+      "step": 3625
+    },
+    {
+      "epoch": 0.6604803493449781,
+      "grad_norm": 0.17185978591442108,
+      "learning_rate": 1.3699454095629372e-05,
+      "loss": 0.14906206130981445,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6613901018922853,
+      "grad_norm": 0.14751191437244415,
+      "learning_rate": 1.3633844263896698e-05,
+      "loss": 0.13991892337799072,
+      "step": 3635
+    },
+    {
+      "epoch": 0.6622998544395924,
+      "grad_norm": 0.22059763967990875,
+      "learning_rate": 1.3568332965728817e-05,
+      "loss": 0.14680869579315187,
+      "step": 3640
+    },
+    {
+      "epoch": 0.6632096069868996,
+      "grad_norm": 0.15295909345149994,
+      "learning_rate": 1.3502920769045232e-05,
+      "loss": 0.1404443383216858,
+      "step": 3645
+    },
+    {
+      "epoch": 0.6641193595342066,
+      "grad_norm": 0.14600558578968048,
+      "learning_rate": 1.3437608240906364e-05,
+      "loss": 0.14663270711898804,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6650291120815138,
+      "grad_norm": 0.15548352897167206,
+      "learning_rate": 1.3372395947508587e-05,
+      "loss": 0.1431443452835083,
+      "step": 3655
+    },
+    {
+      "epoch": 0.665938864628821,
+      "grad_norm": 0.1813388466835022,
+      "learning_rate": 1.3307284454179342e-05,
+      "loss": 0.1458706736564636,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6668486171761281,
+      "grad_norm": 0.16326870024204254,
+      "learning_rate": 1.3242274325372247e-05,
+      "loss": 0.14700595140457154,
+      "step": 3665
+    },
+    {
+      "epoch": 0.6677583697234353,
+      "grad_norm": 0.18779197335243225,
+      "learning_rate": 1.3177366124662149e-05,
+      "loss": 0.1497237801551819,
+      "step": 3670
+    },
+    {
+      "epoch": 0.6686681222707423,
+      "grad_norm": 0.16291002929210663,
+      "learning_rate": 1.3112560414740315e-05,
+      "loss": 0.1387086868286133,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6695778748180495,
+      "grad_norm": 0.1532297134399414,
+      "learning_rate": 1.3047857757409487e-05,
+      "loss": 0.14497545957565308,
+      "step": 3680
+    },
+    {
+      "epoch": 0.6704876273653566,
+      "grad_norm": 0.14697515964508057,
+      "learning_rate": 1.2983258713579066e-05,
+      "loss": 0.1494283437728882,
+      "step": 3685
+    },
+    {
+      "epoch": 0.6713973799126638,
+      "grad_norm": 0.15213452279567719,
+      "learning_rate": 1.2918763843260218e-05,
+      "loss": 0.1468907594680786,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6723071324599709,
+      "grad_norm": 0.1745215803384781,
+      "learning_rate": 1.285437370556099e-05,
+      "loss": 0.14997754096984864,
+      "step": 3695
+    },
+    {
+      "epoch": 0.673216885007278,
+      "grad_norm": 0.19207637012004852,
+      "learning_rate": 1.2790088858681577e-05,
+      "loss": 0.14202862977981567,
+      "step": 3700
+    },
+    {
+      "epoch": 0.6741266375545851,
+      "grad_norm": 0.1521359086036682,
+      "learning_rate": 1.2725909859909313e-05,
+      "loss": 0.14547673463821412,
+      "step": 3705
+    },
+    {
+      "epoch": 0.6750363901018923,
+      "grad_norm": 0.16975535452365875,
+      "learning_rate": 1.2661837265613999e-05,
+      "loss": 0.14006874561309815,
+      "step": 3710
+    },
+    {
+      "epoch": 0.6759461426491994,
+      "grad_norm": 0.22234582901000977,
+      "learning_rate": 1.2597871631242992e-05,
+      "loss": 0.13691173791885375,
+      "step": 3715
+    },
+    {
+      "epoch": 0.6768558951965066,
+      "grad_norm": 0.16082969307899475,
+      "learning_rate": 1.2534013511316383e-05,
+      "loss": 0.14932308197021485,
+      "step": 3720
+    },
+    {
+      "epoch": 0.6777656477438136,
+      "grad_norm": 0.1751091182231903,
+      "learning_rate": 1.247026345942226e-05,
+      "loss": 0.14531974792480468,
+      "step": 3725
+    },
+    {
+      "epoch": 0.6786754002911208,
+      "grad_norm": 0.15838147699832916,
+      "learning_rate": 1.2406622028211844e-05,
+      "loss": 0.14759832620620728,
+      "step": 3730
+    },
+    {
+      "epoch": 0.6795851528384279,
+      "grad_norm": 0.1771744042634964,
+      "learning_rate": 1.2343089769394714e-05,
+      "loss": 0.1382831573486328,
+      "step": 3735
+    },
+    {
+      "epoch": 0.6804949053857351,
+      "grad_norm": 0.16301538050174713,
+      "learning_rate": 1.2279667233734037e-05,
+      "loss": 0.14444775581359864,
+      "step": 3740
+    },
+    {
+      "epoch": 0.6814046579330422,
+      "grad_norm": 0.1584121286869049,
+      "learning_rate": 1.2216354971041796e-05,
+      "loss": 0.14200170040130616,
+      "step": 3745
+    },
+    {
+      "epoch": 0.6823144104803494,
+      "grad_norm": 0.139187291264534,
+      "learning_rate": 1.2153153530174007e-05,
+      "loss": 0.14318310022354125,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6832241630276564,
+      "grad_norm": 0.13665248453617096,
+      "learning_rate": 1.2090063459025955e-05,
+      "loss": 0.1411946654319763,
+      "step": 3755
+    },
+    {
+      "epoch": 0.6841339155749636,
+      "grad_norm": 0.16273781657218933,
+      "learning_rate": 1.2027085304527475e-05,
+      "loss": 0.14873508214950562,
+      "step": 3760
+    },
+    {
+      "epoch": 0.6850436681222707,
+      "grad_norm": 0.16317526996135712,
+      "learning_rate": 1.1964219612638194e-05,
+      "loss": 0.14644203186035157,
+      "step": 3765
+    },
+    {
+      "epoch": 0.6859534206695779,
+      "grad_norm": 0.17253617942333221,
+      "learning_rate": 1.1901466928342777e-05,
+      "loss": 0.14027841091156007,
+      "step": 3770
+    },
+    {
+      "epoch": 0.6868631732168851,
+      "grad_norm": 0.19692830741405487,
+      "learning_rate": 1.183882779564624e-05,
+      "loss": 0.14411110877990724,
+      "step": 3775
+    },
+    {
+      "epoch": 0.6877729257641921,
+      "grad_norm": 0.15444578230381012,
+      "learning_rate": 1.1776302757569214e-05,
+      "loss": 0.14355008602142333,
+      "step": 3780
+    },
+    {
+      "epoch": 0.6886826783114993,
+      "grad_norm": 0.1622200757265091,
+      "learning_rate": 1.1713892356143239e-05,
+      "loss": 0.14794334173202514,
+      "step": 3785
+    },
+    {
+      "epoch": 0.6895924308588064,
+      "grad_norm": 0.1898501068353653,
+      "learning_rate": 1.1651597132406073e-05,
+      "loss": 0.1418622612953186,
+      "step": 3790
+    },
+    {
+      "epoch": 0.6905021834061136,
+      "grad_norm": 0.17803208529949188,
+      "learning_rate": 1.1589417626396973e-05,
+      "loss": 0.14576040506362914,
+      "step": 3795
+    },
+    {
+      "epoch": 0.6914119359534207,
+      "grad_norm": 0.17138013243675232,
+      "learning_rate": 1.1527354377152053e-05,
+      "loss": 0.14494270086288452,
+      "step": 3800
+    },
+    {
+      "epoch": 0.6923216885007278,
+      "grad_norm": 0.15170913934707642,
+      "learning_rate": 1.1465407922699603e-05,
+      "loss": 0.144084370136261,
+      "step": 3805
+    },
+    {
+      "epoch": 0.6932314410480349,
+      "grad_norm": 0.158562570810318,
+      "learning_rate": 1.1403578800055387e-05,
+      "loss": 0.13636608123779298,
+      "step": 3810
+    },
+    {
+      "epoch": 0.6941411935953421,
+      "grad_norm": 0.17687302827835083,
+      "learning_rate": 1.1341867545218044e-05,
+      "loss": 0.14214688539505005,
+      "step": 3815
+    },
+    {
+      "epoch": 0.6950509461426492,
+      "grad_norm": 0.15394899249076843,
+      "learning_rate": 1.1280274693164378e-05,
+      "loss": 0.14914129972457885,
+      "step": 3820
+    },
+    {
+      "epoch": 0.6959606986899564,
+      "grad_norm": 0.15709355473518372,
+      "learning_rate": 1.12188007778448e-05,
+      "loss": 0.14798580408096312,
+      "step": 3825
+    },
+    {
+      "epoch": 0.6968704512372634,
+      "grad_norm": 0.16631539165973663,
+      "learning_rate": 1.115744633217864e-05,
+      "loss": 0.14756966829299928,
+      "step": 3830
+    },
+    {
+      "epoch": 0.6977802037845706,
+      "grad_norm": 0.15893076360225677,
+      "learning_rate": 1.109621188804951e-05,
+      "loss": 0.14061959981918334,
+      "step": 3835
+    },
+    {
+      "epoch": 0.6986899563318777,
+      "grad_norm": 0.183414489030838,
+      "learning_rate": 1.103509797630077e-05,
+      "loss": 0.1448473334312439,
+      "step": 3840
+    },
+    {
+      "epoch": 0.6995997088791849,
+      "grad_norm": 0.14087305963039398,
+      "learning_rate": 1.0974105126730841e-05,
+      "loss": 0.14369285106658936,
+      "step": 3845
+    },
+    {
+      "epoch": 0.700509461426492,
+      "grad_norm": 0.16919967532157898,
+      "learning_rate": 1.0913233868088685e-05,
+      "loss": 0.1478085398674011,
+      "step": 3850
+    },
+    {
+      "epoch": 0.7014192139737991,
+      "grad_norm": 0.1439533829689026,
+      "learning_rate": 1.0852484728069178e-05,
+      "loss": 0.14376721382141114,
+      "step": 3855
+    },
+    {
+      "epoch": 0.7023289665211062,
+      "grad_norm": 0.17719274759292603,
+      "learning_rate": 1.0791858233308521e-05,
+      "loss": 0.14089040756225585,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7032387190684134,
+      "grad_norm": 0.19753769040107727,
+      "learning_rate": 1.0731354909379754e-05,
+      "loss": 0.15021742582321168,
+      "step": 3865
+    },
+    {
+      "epoch": 0.7041484716157205,
+      "grad_norm": 0.19186992943286896,
+      "learning_rate": 1.0670975280788086e-05,
+      "loss": 0.14113202095031738,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7050582241630277,
+      "grad_norm": 0.1709229201078415,
+      "learning_rate": 1.0610719870966443e-05,
+      "loss": 0.1500566840171814,
+      "step": 3875
+    },
+    {
+      "epoch": 0.7059679767103348,
+      "grad_norm": 0.17846204340457916,
+      "learning_rate": 1.0550589202270892e-05,
+      "loss": 0.15014195442199707,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7068777292576419,
+      "grad_norm": 0.1827082335948944,
+      "learning_rate": 1.0490583795976091e-05,
+      "loss": 0.1423472762107849,
+      "step": 3885
+    },
+    {
+      "epoch": 0.7077874818049491,
+      "grad_norm": 0.17418377101421356,
+      "learning_rate": 1.043070417227083e-05,
+      "loss": 0.14668900966644288,
+      "step": 3890
+    },
+    {
+      "epoch": 0.7086972343522562,
+      "grad_norm": 0.17385616898536682,
+      "learning_rate": 1.0370950850253449e-05,
+      "loss": 0.14627279043197633,
+      "step": 3895
+    },
+    {
+      "epoch": 0.7096069868995634,
+      "grad_norm": 0.16486723721027374,
+      "learning_rate": 1.0311324347927404e-05,
+      "loss": 0.14603652954101562,
+      "step": 3900
+    },
+    {
+      "epoch": 0.7105167394468704,
+      "grad_norm": 0.21806862950325012,
+      "learning_rate": 1.0251825182196732e-05,
+      "loss": 0.1488169550895691,
+      "step": 3905
+    },
+    {
+      "epoch": 0.7114264919941776,
+      "grad_norm": 0.19884569942951202,
+      "learning_rate": 1.019245386886159e-05,
+      "loss": 0.14387656450271608,
+      "step": 3910
+    },
+    {
+      "epoch": 0.7123362445414847,
+      "grad_norm": 0.16139011085033417,
+      "learning_rate": 1.0133210922613789e-05,
+      "loss": 0.1483074426651001,
+      "step": 3915
+    },
+    {
+      "epoch": 0.7132459970887919,
+      "grad_norm": 0.17000740766525269,
+      "learning_rate": 1.007409685703229e-05,
+      "loss": 0.14050065279006957,
+      "step": 3920
+    },
+    {
+      "epoch": 0.714155749636099,
+      "grad_norm": 0.17235304415225983,
+      "learning_rate": 1.0015112184578813e-05,
+      "loss": 0.1440442681312561,
+      "step": 3925
+    },
+    {
+      "epoch": 0.7150655021834061,
+      "grad_norm": 0.15737567842006683,
+      "learning_rate": 9.956257416593362e-06,
+      "loss": 0.14960765838623047,
+      "step": 3930
+    },
+    {
+      "epoch": 0.7159752547307132,
+      "grad_norm": 0.15499180555343628,
+      "learning_rate": 9.897533063289773e-06,
+      "loss": 0.14488829374313356,
+      "step": 3935
+    },
+    {
+      "epoch": 0.7168850072780204,
+      "grad_norm": 0.17744216322898865,
+      "learning_rate": 9.838939633751337e-06,
+      "loss": 0.1416949987411499,
+      "step": 3940
+    },
+    {
+      "epoch": 0.7177947598253275,
+      "grad_norm": 0.1597192883491516,
+      "learning_rate": 9.780477635926358e-06,
+      "loss": 0.14275280237197877,
+      "step": 3945
+    },
+    {
+      "epoch": 0.7187045123726347,
+      "grad_norm": 0.17800374329090118,
+      "learning_rate": 9.722147576623743e-06,
+      "loss": 0.14532098770141602,
+      "step": 3950
+    },
+    {
+      "epoch": 0.7196142649199417,
+      "grad_norm": 0.1828162521123886,
+      "learning_rate": 9.66394996150864e-06,
+      "loss": 0.14525585174560546,
+      "step": 3955
+    },
+    {
+      "epoch": 0.7205240174672489,
+      "grad_norm": 0.1800539344549179,
+      "learning_rate": 9.605885295098005e-06,
+      "loss": 0.14235819578170777,
+      "step": 3960
+    },
+    {
+      "epoch": 0.721433770014556,
+      "grad_norm": 0.16556483507156372,
+      "learning_rate": 9.54795408075628e-06,
+      "loss": 0.13965482711791993,
+      "step": 3965
+    },
+    {
+      "epoch": 0.7223435225618632,
+      "grad_norm": 0.1592024862766266,
+      "learning_rate": 9.49015682069101e-06,
+      "loss": 0.14051042795181273,
+      "step": 3970
+    },
+    {
+      "epoch": 0.7232532751091703,
+      "grad_norm": 0.18988847732543945,
+      "learning_rate": 9.43249401594846e-06,
+      "loss": 0.1436900496482849,
+      "step": 3975
+    },
+    {
+      "epoch": 0.7241630276564774,
+      "grad_norm": 0.24433808028697968,
+      "learning_rate": 9.374966166409329e-06,
+      "loss": 0.14883997440338134,
+      "step": 3980
+    },
+    {
+      "epoch": 0.7250727802037845,
+      "grad_norm": 0.15091639757156372,
+      "learning_rate": 9.317573770784352e-06,
+      "loss": 0.14726560115814208,
+      "step": 3985
+    },
+    {
+      "epoch": 0.7259825327510917,
+      "grad_norm": 0.17045573890209198,
+      "learning_rate": 9.260317326610051e-06,
+      "loss": 0.14120506048202514,
+      "step": 3990
+    },
+    {
+      "epoch": 0.7268922852983989,
+      "grad_norm": 0.18847957253456116,
+      "learning_rate": 9.203197330244343e-06,
+      "loss": 0.1377041220664978,
+      "step": 3995
+    },
+    {
+      "epoch": 0.727802037845706,
+      "grad_norm": 0.1516445279121399,
+      "learning_rate": 9.14621427686229e-06,
+      "loss": 0.14043946266174318,
+      "step": 4000
+    },
+    {
+      "epoch": 0.7287117903930131,
+      "grad_norm": 0.18264050781726837,
+      "learning_rate": 9.0893686604518e-06,
+      "loss": 0.14080368280410765,
+      "step": 4005
+    },
+    {
+      "epoch": 0.7296215429403202,
+      "grad_norm": 0.19129371643066406,
+      "learning_rate": 9.032660973809312e-06,
+      "loss": 0.1402561902999878,
+      "step": 4010
+    },
+    {
+      "epoch": 0.7305312954876274,
+      "grad_norm": 0.15762710571289062,
+      "learning_rate": 8.976091708535567e-06,
+      "loss": 0.14421157836914061,
+      "step": 4015
+    },
+    {
+      "epoch": 0.7314410480349345,
+      "grad_norm": 0.17785198986530304,
+      "learning_rate": 8.919661355031331e-06,
+      "loss": 0.14999009370803834,
+      "step": 4020
+    },
+    {
+      "epoch": 0.7323508005822417,
+      "grad_norm": 0.15306031703948975,
+      "learning_rate": 8.8633704024931e-06,
+      "loss": 0.14101698398590087,
+      "step": 4025
+    },
+    {
+      "epoch": 0.7332605531295487,
+      "grad_norm": 0.16481758654117584,
+      "learning_rate": 8.807219338908968e-06,
+      "loss": 0.14170764684677123,
+      "step": 4030
+    },
+    {
+      "epoch": 0.7341703056768559,
+      "grad_norm": 0.14892235398292542,
+      "learning_rate": 8.751208651054257e-06,
+      "loss": 0.15317896604537964,
+      "step": 4035
+    },
+    {
+      "epoch": 0.735080058224163,
+      "grad_norm": 0.1775592565536499,
+      "learning_rate": 8.695338824487409e-06,
+      "loss": 0.1520617723464966,
+      "step": 4040
+    },
+    {
+      "epoch": 0.7359898107714702,
+      "grad_norm": 0.1614258885383606,
+      "learning_rate": 8.639610343545728e-06,
+      "loss": 0.13747400045394897,
+      "step": 4045
+    },
+    {
+      "epoch": 0.7368995633187773,
+      "grad_norm": 0.21415506303310394,
+      "learning_rate": 8.58402369134117e-06,
+      "loss": 0.1432439088821411,
+      "step": 4050
+    },
+    {
+      "epoch": 0.7378093158660844,
+      "grad_norm": 0.1759418249130249,
+      "learning_rate": 8.528579349756205e-06,
+      "loss": 0.141641104221344,
+      "step": 4055
+    },
+    {
+      "epoch": 0.7387190684133915,
+      "grad_norm": 0.16738329827785492,
+      "learning_rate": 8.47327779943957e-06,
+      "loss": 0.14294810295104982,
+      "step": 4060
+    },
+    {
+      "epoch": 0.7396288209606987,
+      "grad_norm": 0.13916844129562378,
+      "learning_rate": 8.41811951980217e-06,
+      "loss": 0.13876968622207642,
+      "step": 4065
+    },
+    {
+      "epoch": 0.7405385735080058,
+      "grad_norm": 0.1828441321849823,
+      "learning_rate": 8.36310498901288e-06,
+      "loss": 0.148428475856781,
+      "step": 4070
+    },
+    {
+      "epoch": 0.741448326055313,
+      "grad_norm": 0.16534076631069183,
+      "learning_rate": 8.308234683994415e-06,
+      "loss": 0.14222711324691772,
+      "step": 4075
+    },
+    {
+      "epoch": 0.74235807860262,
+      "grad_norm": 0.17922644317150116,
+      "learning_rate": 8.253509080419198e-06,
+      "loss": 0.14365782737731933,
+      "step": 4080
+    },
+    {
+      "epoch": 0.7432678311499272,
+      "grad_norm": 0.15061035752296448,
+      "learning_rate": 8.198928652705204e-06,
+      "loss": 0.13571925163269044,
+      "step": 4085
+    },
+    {
+      "epoch": 0.7441775836972343,
+      "grad_norm": 0.18075402081012726,
+      "learning_rate": 8.144493874011908e-06,
+      "loss": 0.14385528564453126,
+      "step": 4090
+    },
+    {
+      "epoch": 0.7450873362445415,
+      "grad_norm": 0.16514739394187927,
+      "learning_rate": 8.090205216236135e-06,
+      "loss": 0.14920626878738402,
+      "step": 4095
+    },
+    {
+      "epoch": 0.7459970887918487,
+      "grad_norm": 0.16453702747821808,
+      "learning_rate": 8.03606315000797e-06,
+      "loss": 0.14704222679138185,
+      "step": 4100
+    },
+    {
+      "epoch": 0.7469068413391557,
+      "grad_norm": 0.16719917953014374,
+      "learning_rate": 7.982068144686707e-06,
+      "loss": 0.14722511768341065,
+      "step": 4105
+    },
+    {
+      "epoch": 0.7478165938864629,
+      "grad_norm": 0.18499110639095306,
+      "learning_rate": 7.92822066835677e-06,
+      "loss": 0.1401848554611206,
+      "step": 4110
+    },
+    {
+      "epoch": 0.74872634643377,
+      "grad_norm": 0.17249563336372375,
+      "learning_rate": 7.87452118782363e-06,
+      "loss": 0.15132423639297485,
+      "step": 4115
+    },
+    {
+      "epoch": 0.7496360989810772,
+      "grad_norm": 0.15049682557582855,
+      "learning_rate": 7.8209701686098e-06,
+      "loss": 0.1341150164604187,
+      "step": 4120
+    },
+    {
+      "epoch": 0.7505458515283843,
+      "grad_norm": 0.16892646253108978,
+      "learning_rate": 7.767568074950751e-06,
+      "loss": 0.1466840147972107,
+      "step": 4125
+    },
+    {
+      "epoch": 0.7514556040756915,
+      "grad_norm": 0.17288286983966827,
+      "learning_rate": 7.714315369790942e-06,
+      "loss": 0.13819680213928223,
+      "step": 4130
+    },
+    {
+      "epoch": 0.7523653566229985,
+      "grad_norm": 0.21893996000289917,
+      "learning_rate": 7.661212514779745e-06,
+      "loss": 0.14369510412216185,
+      "step": 4135
+    },
+    {
+      "epoch": 0.7532751091703057,
+      "grad_norm": 0.1674601435661316,
+      "learning_rate": 7.608259970267509e-06,
+      "loss": 0.14810250997543334,
+      "step": 4140
+    },
+    {
+      "epoch": 0.7541848617176128,
+      "grad_norm": 0.15875539183616638,
+      "learning_rate": 7.555458195301526e-06,
+      "loss": 0.14103198051452637,
+      "step": 4145
+    },
+    {
+      "epoch": 0.75509461426492,
+      "grad_norm": 0.19454079866409302,
+      "learning_rate": 7.502807647622037e-06,
+      "loss": 0.13848764896392823,
+      "step": 4150
+    },
+    {
+      "epoch": 0.756004366812227,
+      "grad_norm": 0.1795455813407898,
+      "learning_rate": 7.450308783658341e-06,
+      "loss": 0.14459335803985596,
+      "step": 4155
+    },
+    {
+      "epoch": 0.7569141193595342,
+      "grad_norm": 0.1643362045288086,
+      "learning_rate": 7.397962058524735e-06,
+      "loss": 0.14335378408432006,
+      "step": 4160
+    },
+    {
+      "epoch": 0.7578238719068413,
+      "grad_norm": 0.16362066566944122,
+      "learning_rate": 7.3457679260166475e-06,
+      "loss": 0.14222005605697632,
+      "step": 4165
+    },
+    {
+      "epoch": 0.7587336244541485,
+      "grad_norm": 0.17313003540039062,
+      "learning_rate": 7.293726838606674e-06,
+      "loss": 0.14272255897521974,
+      "step": 4170
+    },
+    {
+      "epoch": 0.7596433770014556,
+      "grad_norm": 0.1809929460287094,
+      "learning_rate": 7.2418392474406405e-06,
+      "loss": 0.14089123010635377,
+      "step": 4175
+    },
+    {
+      "epoch": 0.7605531295487628,
+      "grad_norm": 0.14306005835533142,
+      "learning_rate": 7.19010560233373e-06,
+      "loss": 0.13531534671783446,
+      "step": 4180
+    },
+    {
+      "epoch": 0.7614628820960698,
+      "grad_norm": 0.15525390207767487,
+      "learning_rate": 7.138526351766559e-06,
+      "loss": 0.14340845346450806,
+      "step": 4185
+    },
+    {
+      "epoch": 0.762372634643377,
+      "grad_norm": 0.24478943645954132,
+      "learning_rate": 7.087101942881263e-06,
+      "loss": 0.14744555950164795,
+      "step": 4190
+    },
+    {
+      "epoch": 0.7632823871906841,
+      "grad_norm": 0.31335577368736267,
+      "learning_rate": 7.035832821477711e-06,
+      "loss": 0.1484094500541687,
+      "step": 4195
+    },
+    {
+      "epoch": 0.7641921397379913,
+      "grad_norm": 0.15140366554260254,
+      "learning_rate": 6.984719432009515e-06,
+      "loss": 0.14991614818572999,
+      "step": 4200
+    },
+    {
+      "epoch": 0.7651018922852983,
+      "grad_norm": 0.16125506162643433,
+      "learning_rate": 6.933762217580289e-06,
+      "loss": 0.1408134937286377,
+      "step": 4205
+    },
+    {
+      "epoch": 0.7660116448326055,
+      "grad_norm": 0.2501450181007385,
+      "learning_rate": 6.882961619939726e-06,
+      "loss": 0.13875640630722047,
+      "step": 4210
+    },
+    {
+      "epoch": 0.7669213973799127,
+      "grad_norm": 0.16227811574935913,
+      "learning_rate": 6.8323180794798245e-06,
+      "loss": 0.14138660430908204,
+      "step": 4215
+    },
+    {
+      "epoch": 0.7678311499272198,
+      "grad_norm": 0.16676810383796692,
+      "learning_rate": 6.781832035231053e-06,
+      "loss": 0.14696706533432008,
+      "step": 4220
+    },
+    {
+      "epoch": 0.768740902474527,
+      "grad_norm": 0.14638574421405792,
+      "learning_rate": 6.731503924858518e-06,
+      "loss": 0.14263020753860473,
+      "step": 4225
+    },
+    {
+      "epoch": 0.769650655021834,
+      "grad_norm": 0.17093190550804138,
+      "learning_rate": 6.681334184658211e-06,
+      "loss": 0.14694111347198485,
+      "step": 4230
+    },
+    {
+      "epoch": 0.7705604075691412,
+      "grad_norm": 0.17174287140369415,
+      "learning_rate": 6.631323249553201e-06,
+      "loss": 0.13854929208755493,
+      "step": 4235
+    },
+    {
+      "epoch": 0.7714701601164483,
+      "grad_norm": 0.14599016308784485,
+      "learning_rate": 6.5814715530898745e-06,
+      "loss": 0.14058833122253417,
+      "step": 4240
+    },
+    {
+      "epoch": 0.7723799126637555,
+      "grad_norm": 0.16222265362739563,
+      "learning_rate": 6.531779527434176e-06,
+      "loss": 0.1428326725959778,
+      "step": 4245
+    },
+    {
+      "epoch": 0.7732896652110626,
+      "grad_norm": 0.1741994023323059,
+      "learning_rate": 6.482247603367839e-06,
+      "loss": 0.13985042572021483,
+      "step": 4250
+    },
+    {
+      "epoch": 0.7741994177583698,
+      "grad_norm": 0.17427101731300354,
+      "learning_rate": 6.432876210284688e-06,
+      "loss": 0.1442667603492737,
+      "step": 4255
+    },
+    {
+      "epoch": 0.7751091703056768,
+      "grad_norm": 0.1665259599685669,
+      "learning_rate": 6.383665776186912e-06,
+      "loss": 0.1421986222267151,
+      "step": 4260
+    },
+    {
+      "epoch": 0.776018922852984,
+      "grad_norm": 0.1728232353925705,
+      "learning_rate": 6.334616727681303e-06,
+      "loss": 0.1367053508758545,
+      "step": 4265
+    },
+    {
+      "epoch": 0.7769286754002911,
+      "grad_norm": 0.15882381796836853,
+      "learning_rate": 6.285729489975639e-06,
+      "loss": 0.14551182985305786,
+      "step": 4270
+    },
+    {
+      "epoch": 0.7778384279475983,
+      "grad_norm": 0.242042675614357,
+      "learning_rate": 6.2370044868749115e-06,
+      "loss": 0.1455132007598877,
+      "step": 4275
+    },
+    {
+      "epoch": 0.7787481804949054,
+      "grad_norm": 0.1599501073360443,
+      "learning_rate": 6.188442140777742e-06,
+      "loss": 0.1424942970275879,
+      "step": 4280
+    },
+    {
+      "epoch": 0.7796579330422125,
+      "grad_norm": 0.15182635188102722,
+      "learning_rate": 6.140042872672647e-06,
+      "loss": 0.14212887287139891,
+      "step": 4285
+    },
+    {
+      "epoch": 0.7805676855895196,
+      "grad_norm": 0.1720375418663025,
+      "learning_rate": 6.091807102134403e-06,
+      "loss": 0.14243412017822266,
+      "step": 4290
+    },
+    {
+      "epoch": 0.7814774381368268,
+      "grad_norm": 0.16436047852039337,
+      "learning_rate": 6.043735247320454e-06,
+      "loss": 0.15035657882690429,
+      "step": 4295
+    },
+    {
+      "epoch": 0.7823871906841339,
+      "grad_norm": 0.1498408019542694,
+      "learning_rate": 5.995827724967218e-06,
+      "loss": 0.14494839906692505,
+      "step": 4300
+    },
+    {
+      "epoch": 0.7832969432314411,
+      "grad_norm": 0.16924560070037842,
+      "learning_rate": 5.948084950386535e-06,
+      "loss": 0.13581212759017944,
+      "step": 4305
+    },
+    {
+      "epoch": 0.7842066957787481,
+      "grad_norm": 0.15889139473438263,
+      "learning_rate": 5.900507337462036e-06,
+      "loss": 0.15071530342102052,
+      "step": 4310
+    },
+    {
+      "epoch": 0.7851164483260553,
+      "grad_norm": 0.17201054096221924,
+      "learning_rate": 5.853095298645542e-06,
+      "loss": 0.1398628830909729,
+      "step": 4315
+    },
+    {
+      "epoch": 0.7860262008733624,
+      "grad_norm": 0.17965619266033173,
+      "learning_rate": 5.805849244953548e-06,
+      "loss": 0.14666696786880493,
+      "step": 4320
+    },
+    {
+      "epoch": 0.7869359534206696,
+      "grad_norm": 0.17514032125473022,
+      "learning_rate": 5.758769585963569e-06,
+      "loss": 0.1383386731147766,
+      "step": 4325
+    },
+    {
+      "epoch": 0.7878457059679768,
+      "grad_norm": 0.17497631907463074,
+      "learning_rate": 5.7118567298106744e-06,
+      "loss": 0.14362354278564454,
+      "step": 4330
+    },
+    {
+      "epoch": 0.7887554585152838,
+      "grad_norm": 0.16770458221435547,
+      "learning_rate": 5.665111083183905e-06,
+      "loss": 0.14136618375778198,
+      "step": 4335
+    },
+    {
+      "epoch": 0.789665211062591,
+      "grad_norm": 0.17134106159210205,
+      "learning_rate": 5.618533051322747e-06,
+      "loss": 0.1401529550552368,
+      "step": 4340
+    },
+    {
+      "epoch": 0.7905749636098981,
+      "grad_norm": 0.19458788633346558,
+      "learning_rate": 5.5721230380136435e-06,
+      "loss": 0.1393273115158081,
+      "step": 4345
+    },
+    {
+      "epoch": 0.7914847161572053,
+      "grad_norm": 0.19483692944049835,
+      "learning_rate": 5.525881445586467e-06,
+      "loss": 0.1369825482368469,
+      "step": 4350
+    },
+    {
+      "epoch": 0.7923944687045124,
+      "grad_norm": 0.3052191734313965,
+      "learning_rate": 5.4798086749110495e-06,
+      "loss": 0.14762181043624878,
+      "step": 4355
+    },
+    {
+      "epoch": 0.7933042212518195,
+      "grad_norm": 0.164458766579628,
+      "learning_rate": 5.4339051253937065e-06,
+      "loss": 0.14501686096191407,
+      "step": 4360
+    },
+    {
+      "epoch": 0.7942139737991266,
+      "grad_norm": 0.1719193458557129,
+      "learning_rate": 5.3881711949737625e-06,
+      "loss": 0.13321092128753662,
+      "step": 4365
+    },
+    {
+      "epoch": 0.7951237263464338,
+      "grad_norm": 0.17219696938991547,
+      "learning_rate": 5.342607280120121e-06,
+      "loss": 0.1413906455039978,
+      "step": 4370
+    },
+    {
+      "epoch": 0.7960334788937409,
+      "grad_norm": 0.15083056688308716,
+      "learning_rate": 5.297213775827789e-06,
+      "loss": 0.14772192239761353,
+      "step": 4375
+    },
+    {
+      "epoch": 0.7969432314410481,
+      "grad_norm": 0.1699071079492569,
+      "learning_rate": 5.251991075614507e-06,
+      "loss": 0.1392375946044922,
+      "step": 4380
+    },
+    {
+      "epoch": 0.7978529839883551,
+      "grad_norm": 0.1680395007133484,
+      "learning_rate": 5.206939571517302e-06,
+      "loss": 0.14185575246810914,
+      "step": 4385
+    },
+    {
+      "epoch": 0.7987627365356623,
+      "grad_norm": 0.16526710987091064,
+      "learning_rate": 5.162059654089083e-06,
+      "loss": 0.15001428127288818,
+      "step": 4390
+    },
+    {
+      "epoch": 0.7996724890829694,
+      "grad_norm": 0.16281752288341522,
+      "learning_rate": 5.1173517123952794e-06,
+      "loss": 0.13747023344039916,
+      "step": 4395
+    },
+    {
+      "epoch": 0.8005822416302766,
+      "grad_norm": 0.1454378366470337,
+      "learning_rate": 5.072816134010458e-06,
+      "loss": 0.14710829257965088,
+      "step": 4400
+    },
+    {
+      "epoch": 0.8014919941775837,
+      "grad_norm": 0.16565890610218048,
+      "learning_rate": 5.028453305014966e-06,
+      "loss": 0.14138611555099487,
+      "step": 4405
+    },
+    {
+      "epoch": 0.8024017467248908,
+      "grad_norm": 0.1962810605764389,
+      "learning_rate": 4.984263609991577e-06,
+      "loss": 0.13836177587509155,
+      "step": 4410
+    },
+    {
+      "epoch": 0.8033114992721979,
+      "grad_norm": 0.16091369092464447,
+      "learning_rate": 4.940247432022149e-06,
+      "loss": 0.14407440423965454,
+      "step": 4415
+    },
+    {
+      "epoch": 0.8042212518195051,
+      "grad_norm": 0.1930241584777832,
+      "learning_rate": 4.89640515268433e-06,
+      "loss": 0.14346336126327514,
+      "step": 4420
+    },
+    {
+      "epoch": 0.8051310043668122,
+      "grad_norm": 0.19301500916481018,
+      "learning_rate": 4.852737152048242e-06,
+      "loss": 0.14174317121505736,
+      "step": 4425
+    },
+    {
+      "epoch": 0.8060407569141194,
+      "grad_norm": 0.1541353315114975,
+      "learning_rate": 4.80924380867315e-06,
+      "loss": 0.14100592136383056,
+      "step": 4430
+    },
+    {
+      "epoch": 0.8069505094614265,
+      "grad_norm": 0.16285750269889832,
+      "learning_rate": 4.765925499604243e-06,
+      "loss": 0.1441288709640503,
+      "step": 4435
+    },
+    {
+      "epoch": 0.8078602620087336,
+      "grad_norm": 0.17382675409317017,
+      "learning_rate": 4.722782600369299e-06,
+      "loss": 0.13763951063156127,
+      "step": 4440
+    },
+    {
+      "epoch": 0.8087700145560408,
+      "grad_norm": 0.1697344034910202,
+      "learning_rate": 4.679815484975505e-06,
+      "loss": 0.1410105347633362,
+      "step": 4445
+    },
+    {
+      "epoch": 0.8096797671033479,
+      "grad_norm": 0.19964542984962463,
+      "learning_rate": 4.637024525906131e-06,
+      "loss": 0.1439276695251465,
+      "step": 4450
+    },
+    {
+      "epoch": 0.8105895196506551,
+      "grad_norm": 0.165307879447937,
+      "learning_rate": 4.59441009411736e-06,
+      "loss": 0.13897504806518554,
+      "step": 4455
+    },
+    {
+      "epoch": 0.8114992721979621,
+      "grad_norm": 0.16687989234924316,
+      "learning_rate": 4.551972559035067e-06,
+      "loss": 0.1422593355178833,
+      "step": 4460
+    },
+    {
+      "epoch": 0.8124090247452693,
+      "grad_norm": 0.15737789869308472,
+      "learning_rate": 4.509712288551571e-06,
+      "loss": 0.1452128052711487,
+      "step": 4465
+    },
+    {
+      "epoch": 0.8133187772925764,
+      "grad_norm": 0.17116659879684448,
+      "learning_rate": 4.467629649022509e-06,
+      "loss": 0.14385371208190917,
+      "step": 4470
+    },
+    {
+      "epoch": 0.8142285298398836,
+      "grad_norm": 0.17457640171051025,
+      "learning_rate": 4.425725005263623e-06,
+      "loss": 0.14808475971221924,
+      "step": 4475
+    },
+    {
+      "epoch": 0.8151382823871907,
+      "grad_norm": 0.1621970385313034,
+      "learning_rate": 4.383998720547583e-06,
+      "loss": 0.13927959203720092,
+      "step": 4480
+    },
+    {
+      "epoch": 0.8160480349344978,
+      "grad_norm": 0.176296666264534,
+      "learning_rate": 4.342451156600896e-06,
+      "loss": 0.15041060447692872,
+      "step": 4485
+    },
+    {
+      "epoch": 0.8169577874818049,
+      "grad_norm": 0.17157645523548126,
+      "learning_rate": 4.301082673600698e-06,
+      "loss": 0.13932652473449708,
+      "step": 4490
+    },
+    {
+      "epoch": 0.8178675400291121,
+      "grad_norm": 0.15378527343273163,
+      "learning_rate": 4.259893630171682e-06,
+      "loss": 0.1406856894493103,
+      "step": 4495
+    },
+    {
+      "epoch": 0.8187772925764192,
+      "grad_norm": 0.1750226765871048,
+      "learning_rate": 4.218884383382987e-06,
+      "loss": 0.1350164532661438,
+      "step": 4500
+    },
+    {
+      "epoch": 0.8196870451237264,
+      "grad_norm": 0.1393742561340332,
+      "learning_rate": 4.178055288745053e-06,
+      "loss": 0.13769235610961914,
+      "step": 4505
+    },
+    {
+      "epoch": 0.8205967976710334,
+      "grad_norm": 0.1668994128704071,
+      "learning_rate": 4.137406700206617e-06,
+      "loss": 0.14029752016067504,
+      "step": 4510
+    },
+    {
+      "epoch": 0.8215065502183406,
+      "grad_norm": 0.1833454668521881,
+      "learning_rate": 4.0969389701515675e-06,
+      "loss": 0.14276301860809326,
+      "step": 4515
+    },
+    {
+      "epoch": 0.8224163027656477,
+      "grad_norm": 0.16187874972820282,
+      "learning_rate": 4.056652449395945e-06,
+      "loss": 0.1444832682609558,
+      "step": 4520
+    },
+    {
+      "epoch": 0.8233260553129549,
+      "grad_norm": 0.1453280746936798,
+      "learning_rate": 4.01654748718488e-06,
+      "loss": 0.14512733221054078,
+      "step": 4525
+    },
+    {
+      "epoch": 0.824235807860262,
+      "grad_norm": 0.1782725751399994,
+      "learning_rate": 3.976624431189563e-06,
+      "loss": 0.14093561172485353,
+      "step": 4530
+    },
+    {
+      "epoch": 0.8251455604075691,
+      "grad_norm": 0.17374491691589355,
+      "learning_rate": 3.936883627504234e-06,
+      "loss": 0.14031401872634888,
+      "step": 4535
+    },
+    {
+      "epoch": 0.8260553129548762,
+      "grad_norm": 0.1609172821044922,
+      "learning_rate": 3.897325420643174e-06,
+      "loss": 0.1428336262702942,
+      "step": 4540
+    },
+    {
+      "epoch": 0.8269650655021834,
+      "grad_norm": 0.1520884931087494,
+      "learning_rate": 3.85795015353774e-06,
+      "loss": 0.1460547924041748,
+      "step": 4545
+    },
+    {
+      "epoch": 0.8278748180494906,
+      "grad_norm": 0.20986326038837433,
+      "learning_rate": 3.818758167533376e-06,
+      "loss": 0.14706350564956666,
+      "step": 4550
+    },
+    {
+      "epoch": 0.8287845705967977,
+      "grad_norm": 0.16825413703918457,
+      "learning_rate": 3.7797498023866396e-06,
+      "loss": 0.14507200717926025,
+      "step": 4555
+    },
+    {
+      "epoch": 0.8296943231441049,
+      "grad_norm": 0.16758380830287933,
+      "learning_rate": 3.740925396262296e-06,
+      "loss": 0.14898381233215333,
+      "step": 4560
+    },
+    {
+      "epoch": 0.8306040756914119,
+      "grad_norm": 0.15207453072071075,
+      "learning_rate": 3.7022852857303503e-06,
+      "loss": 0.14138854742050172,
+      "step": 4565
+    },
+    {
+      "epoch": 0.8315138282387191,
+      "grad_norm": 0.15150749683380127,
+      "learning_rate": 3.66382980576315e-06,
+      "loss": 0.13894975185394287,
+      "step": 4570
+    },
+    {
+      "epoch": 0.8324235807860262,
+      "grad_norm": 0.17071188986301422,
+      "learning_rate": 3.625559289732472e-06,
+      "loss": 0.14072470664978026,
+      "step": 4575
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.154335618019104,
+      "learning_rate": 3.5874740694066294e-06,
+      "loss": 0.13791344165802003,
+      "step": 4580
+    },
+    {
+      "epoch": 0.8342430858806404,
+      "grad_norm": 0.14017128944396973,
+      "learning_rate": 3.5495744749476116e-06,
+      "loss": 0.14427922964096068,
+      "step": 4585
+    },
+    {
+      "epoch": 0.8351528384279476,
+      "grad_norm": 0.17210033535957336,
+      "learning_rate": 3.5118608349081983e-06,
+      "loss": 0.15191166400909423,
+      "step": 4590
+    },
+    {
+      "epoch": 0.8360625909752547,
+      "grad_norm": 0.18715685606002808,
+      "learning_rate": 3.4743334762291358e-06,
+      "loss": 0.14451316595077515,
+      "step": 4595
+    },
+    {
+      "epoch": 0.8369723435225619,
+      "grad_norm": 0.18079884350299835,
+      "learning_rate": 3.436992724236293e-06,
+      "loss": 0.13530746698379517,
+      "step": 4600
+    },
+    {
+      "epoch": 0.837882096069869,
+      "grad_norm": 0.13519920408725739,
+      "learning_rate": 3.399838902637817e-06,
+      "loss": 0.1477964401245117,
+      "step": 4605
+    },
+    {
+      "epoch": 0.8387918486171762,
+      "grad_norm": 0.1778026670217514,
+      "learning_rate": 3.3628723335213885e-06,
+      "loss": 0.14419831037521363,
+      "step": 4610
+    },
+    {
+      "epoch": 0.8397016011644832,
+      "grad_norm": 0.15165366232395172,
+      "learning_rate": 3.326093337351355e-06,
+      "loss": 0.13888469934463502,
+      "step": 4615
+    },
+    {
+      "epoch": 0.8406113537117904,
+      "grad_norm": 0.17049473524093628,
+      "learning_rate": 3.2895022329660018e-06,
+      "loss": 0.14438477754592896,
+      "step": 4620
+    },
+    {
+      "epoch": 0.8415211062590975,
+      "grad_norm": 0.16536414623260498,
+      "learning_rate": 3.2530993375747833e-06,
+      "loss": 0.1444351315498352,
+      "step": 4625
+    },
+    {
+      "epoch": 0.8424308588064047,
+      "grad_norm": 0.17570015788078308,
+      "learning_rate": 3.2168849667555402e-06,
+      "loss": 0.13861945867538453,
+      "step": 4630
+    },
+    {
+      "epoch": 0.8433406113537117,
+      "grad_norm": 0.1699545532464981,
+      "learning_rate": 3.1808594344518132e-06,
+      "loss": 0.13902754783630372,
+      "step": 4635
+    },
+    {
+      "epoch": 0.8442503639010189,
+      "grad_norm": 0.12331254780292511,
+      "learning_rate": 3.1450230529700837e-06,
+      "loss": 0.14104254245758058,
+      "step": 4640
+    },
+    {
+      "epoch": 0.845160116448326,
+      "grad_norm": 0.1508190929889679,
+      "learning_rate": 3.1093761329770708e-06,
+      "loss": 0.13288766145706177,
+      "step": 4645
+    },
+    {
+      "epoch": 0.8460698689956332,
+      "grad_norm": 0.19049489498138428,
+      "learning_rate": 3.0739189834970735e-06,
+      "loss": 0.14914840459823608,
+      "step": 4650
+    },
+    {
+      "epoch": 0.8469796215429404,
+      "grad_norm": 0.1662369966506958,
+      "learning_rate": 3.0386519119092293e-06,
+      "loss": 0.14222898483276367,
+      "step": 4655
+    },
+    {
+      "epoch": 0.8478893740902474,
+      "grad_norm": 0.18985967338085175,
+      "learning_rate": 3.0035752239449126e-06,
+      "loss": 0.14431113004684448,
+      "step": 4660
+    },
+    {
+      "epoch": 0.8487991266375546,
+      "grad_norm": 0.17005261778831482,
+      "learning_rate": 2.9686892236850337e-06,
+      "loss": 0.14140807390213012,
+      "step": 4665
+    },
+    {
+      "epoch": 0.8497088791848617,
+      "grad_norm": 0.16786684095859528,
+      "learning_rate": 2.9339942135574394e-06,
+      "loss": 0.14161460399627684,
+      "step": 4670
+    },
+    {
+      "epoch": 0.8506186317321689,
+      "grad_norm": 0.16358181834220886,
+      "learning_rate": 2.899490494334281e-06,
+      "loss": 0.14674670696258546,
+      "step": 4675
+    },
+    {
+      "epoch": 0.851528384279476,
+      "grad_norm": 0.1651349812746048,
+      "learning_rate": 2.8651783651293867e-06,
+      "loss": 0.13794611692428588,
+      "step": 4680
+    },
+    {
+      "epoch": 0.8524381368267832,
+      "grad_norm": 0.16934923827648163,
+      "learning_rate": 2.831058123395694e-06,
+      "loss": 0.13199397325515747,
+      "step": 4685
+    },
+    {
+      "epoch": 0.8533478893740902,
+      "grad_norm": 0.1704150140285492,
+      "learning_rate": 2.797130064922665e-06,
+      "loss": 0.14044904708862305,
+      "step": 4690
+    },
+    {
+      "epoch": 0.8542576419213974,
+      "grad_norm": 0.1814192682504654,
+      "learning_rate": 2.7633944838337143e-06,
+      "loss": 0.1465100646018982,
+      "step": 4695
+    },
+    {
+      "epoch": 0.8551673944687045,
+      "grad_norm": 0.18942610919475555,
+      "learning_rate": 2.729851672583669e-06,
+      "loss": 0.14685982465744019,
+      "step": 4700
+    },
+    {
+      "epoch": 0.8560771470160117,
+      "grad_norm": 0.17895208299160004,
+      "learning_rate": 2.6965019219562155e-06,
+      "loss": 0.13971571922302245,
+      "step": 4705
+    },
+    {
+      "epoch": 0.8569868995633187,
+      "grad_norm": 0.22735828161239624,
+      "learning_rate": 2.6633455210614055e-06,
+      "loss": 0.13776102066040039,
+      "step": 4710
+    },
+    {
+      "epoch": 0.8578966521106259,
+      "grad_norm": 0.16779793798923492,
+      "learning_rate": 2.630382757333133e-06,
+      "loss": 0.14134042263031005,
+      "step": 4715
+    },
+    {
+      "epoch": 0.858806404657933,
+      "grad_norm": 0.2148888260126114,
+      "learning_rate": 2.597613916526637e-06,
+      "loss": 0.14680721759796142,
+      "step": 4720
+    },
+    {
+      "epoch": 0.8597161572052402,
+      "grad_norm": 0.16560257971286774,
+      "learning_rate": 2.565039282716045e-06,
+      "loss": 0.14137234687805175,
+      "step": 4725
+    },
+    {
+      "epoch": 0.8606259097525473,
+      "grad_norm": 0.16197068989276886,
+      "learning_rate": 2.532659138291879e-06,
+      "loss": 0.14969314336776735,
+      "step": 4730
+    },
+    {
+      "epoch": 0.8615356622998545,
+      "grad_norm": 0.14650246500968933,
+      "learning_rate": 2.5004737639586497e-06,
+      "loss": 0.13532910346984864,
+      "step": 4735
+    },
+    {
+      "epoch": 0.8624454148471615,
+      "grad_norm": 0.1565634310245514,
+      "learning_rate": 2.4684834387323943e-06,
+      "loss": 0.14146244525909424,
+      "step": 4740
+    },
+    {
+      "epoch": 0.8633551673944687,
+      "grad_norm": 0.18060864508152008,
+      "learning_rate": 2.4366884399382393e-06,
+      "loss": 0.14218534231185914,
+      "step": 4745
+    },
+    {
+      "epoch": 0.8642649199417758,
+      "grad_norm": 0.24613255262374878,
+      "learning_rate": 2.4050890432080557e-06,
+      "loss": 0.13907679319381713,
+      "step": 4750
+    },
+    {
+      "epoch": 0.865174672489083,
+      "grad_norm": 0.16036023199558258,
+      "learning_rate": 2.3736855224780057e-06,
+      "loss": 0.13718113899230958,
+      "step": 4755
+    },
+    {
+      "epoch": 0.86608442503639,
+      "grad_norm": 0.16678516566753387,
+      "learning_rate": 2.3424781499862075e-06,
+      "loss": 0.1327962040901184,
+      "step": 4760
+    },
+    {
+      "epoch": 0.8669941775836972,
+      "grad_norm": 0.1763770878314972,
+      "learning_rate": 2.3114671962703727e-06,
+      "loss": 0.14390318393707274,
+      "step": 4765
+    },
+    {
+      "epoch": 0.8679039301310044,
+      "grad_norm": 0.17735697329044342,
+      "learning_rate": 2.280652930165428e-06,
+      "loss": 0.15223288536071777,
+      "step": 4770
+    },
+    {
+      "epoch": 0.8688136826783115,
+      "grad_norm": 0.15827041864395142,
+      "learning_rate": 2.250035618801241e-06,
+      "loss": 0.14296332597732545,
+      "step": 4775
+    },
+    {
+      "epoch": 0.8697234352256187,
+      "grad_norm": 0.16876135766506195,
+      "learning_rate": 2.219615527600244e-06,
+      "loss": 0.1359076738357544,
+      "step": 4780
+    },
+    {
+      "epoch": 0.8706331877729258,
+      "grad_norm": 0.1800110638141632,
+      "learning_rate": 2.189392920275174e-06,
+      "loss": 0.1424281358718872,
+      "step": 4785
+    },
+    {
+      "epoch": 0.8715429403202329,
+      "grad_norm": 0.1409560889005661,
+      "learning_rate": 2.159368058826783e-06,
+      "loss": 0.14480490684509278,
+      "step": 4790
+    },
+    {
+      "epoch": 0.87245269286754,
+      "grad_norm": 0.1634288728237152,
+      "learning_rate": 2.129541203541535e-06,
+      "loss": 0.14513269662857056,
+      "step": 4795
+    },
+    {
+      "epoch": 0.8733624454148472,
+      "grad_norm": 0.17126062512397766,
+      "learning_rate": 2.099912612989391e-06,
+      "loss": 0.13546934127807617,
+      "step": 4800
+    },
+    {
+      "epoch": 0.8742721979621543,
+      "grad_norm": 0.16704080998897552,
+      "learning_rate": 2.0704825440215457e-06,
+      "loss": 0.13852492570877076,
+      "step": 4805
+    },
+    {
+      "epoch": 0.8751819505094615,
+      "grad_norm": 0.1725970208644867,
+      "learning_rate": 2.0412512517681946e-06,
+      "loss": 0.14504197835922242,
+      "step": 4810
+    },
+    {
+      "epoch": 0.8760917030567685,
+      "grad_norm": 0.1700201779603958,
+      "learning_rate": 2.0122189896363387e-06,
+      "loss": 0.14312338829040527,
+      "step": 4815
+    },
+    {
+      "epoch": 0.8770014556040757,
+      "grad_norm": 0.16491736471652985,
+      "learning_rate": 1.9833860093075834e-06,
+      "loss": 0.14062976837158203,
+      "step": 4820
+    },
+    {
+      "epoch": 0.8779112081513828,
+      "grad_norm": 0.13748787343502045,
+      "learning_rate": 1.9547525607359537e-06,
+      "loss": 0.1346171498298645,
+      "step": 4825
+    },
+    {
+      "epoch": 0.87882096069869,
+      "grad_norm": 0.16399399936199188,
+      "learning_rate": 1.926318892145712e-06,
+      "loss": 0.14178123474121093,
+      "step": 4830
+    },
+    {
+      "epoch": 0.879730713245997,
+      "grad_norm": 0.14491963386535645,
+      "learning_rate": 1.8980852500292412e-06,
+      "loss": 0.1408564567565918,
+      "step": 4835
+    },
+    {
+      "epoch": 0.8806404657933042,
+      "grad_norm": 0.17335423827171326,
+      "learning_rate": 1.8700518791448851e-06,
+      "loss": 0.14403265714645386,
+      "step": 4840
+    },
+    {
+      "epoch": 0.8815502183406113,
+      "grad_norm": 0.17399625480175018,
+      "learning_rate": 1.8422190225148155e-06,
+      "loss": 0.14289036989212037,
+      "step": 4845
+    },
+    {
+      "epoch": 0.8824599708879185,
+      "grad_norm": 0.17945612967014313,
+      "learning_rate": 1.814586921422956e-06,
+      "loss": 0.14494109153747559,
+      "step": 4850
+    },
+    {
+      "epoch": 0.8833697234352256,
+      "grad_norm": 0.1910620480775833,
+      "learning_rate": 1.7871558154128664e-06,
+      "loss": 0.13726245164871215,
+      "step": 4855
+    },
+    {
+      "epoch": 0.8842794759825328,
+      "grad_norm": 0.1771879345178604,
+      "learning_rate": 1.7599259422856756e-06,
+      "loss": 0.1464752197265625,
+      "step": 4860
+    },
+    {
+      "epoch": 0.8851892285298398,
+      "grad_norm": 0.19427461922168732,
+      "learning_rate": 1.7328975380980218e-06,
+      "loss": 0.13823356628417968,
+      "step": 4865
+    },
+    {
+      "epoch": 0.886098981077147,
+      "grad_norm": 0.1491149365901947,
+      "learning_rate": 1.7060708371599897e-06,
+      "loss": 0.1338604211807251,
+      "step": 4870
+    },
+    {
+      "epoch": 0.8870087336244541,
+      "grad_norm": 0.16087733209133148,
+      "learning_rate": 1.6794460720331057e-06,
+      "loss": 0.14184389114379883,
+      "step": 4875
+    },
+    {
+      "epoch": 0.8879184861717613,
+      "grad_norm": 0.14506325125694275,
+      "learning_rate": 1.653023473528309e-06,
+      "loss": 0.14267687797546386,
+      "step": 4880
+    },
+    {
+      "epoch": 0.8888282387190685,
+      "grad_norm": 0.16886365413665771,
+      "learning_rate": 1.626803270703936e-06,
+      "loss": 0.14266083240509034,
+      "step": 4885
+    },
+    {
+      "epoch": 0.8897379912663755,
+      "grad_norm": 0.1891999989748001,
+      "learning_rate": 1.6007856908637652e-06,
+      "loss": 0.1398016929626465,
+      "step": 4890
+    },
+    {
+      "epoch": 0.8906477438136827,
+      "grad_norm": 0.17645299434661865,
+      "learning_rate": 1.5749709595550083e-06,
+      "loss": 0.13869571685791016,
+      "step": 4895
+    },
+    {
+      "epoch": 0.8915574963609898,
+      "grad_norm": 0.17714262008666992,
+      "learning_rate": 1.549359300566408e-06,
+      "loss": 0.14957486391067504,
+      "step": 4900
+    },
+    {
+      "epoch": 0.892467248908297,
+      "grad_norm": 0.18025240302085876,
+      "learning_rate": 1.5239509359262355e-06,
+      "loss": 0.1358652949333191,
+      "step": 4905
+    },
+    {
+      "epoch": 0.8933770014556041,
+      "grad_norm": 0.17539937794208527,
+      "learning_rate": 1.4987460859004154e-06,
+      "loss": 0.13833394050598144,
+      "step": 4910
+    },
+    {
+      "epoch": 0.8942867540029112,
+      "grad_norm": 0.1772230565547943,
+      "learning_rate": 1.4737449689905953e-06,
+      "loss": 0.14202116727828978,
+      "step": 4915
+    },
+    {
+      "epoch": 0.8951965065502183,
+      "grad_norm": 0.1670161783695221,
+      "learning_rate": 1.4489478019322433e-06,
+      "loss": 0.1403665542602539,
+      "step": 4920
+    },
+    {
+      "epoch": 0.8961062590975255,
+      "grad_norm": 0.1697034239768982,
+      "learning_rate": 1.4243547996927926e-06,
+      "loss": 0.1401481032371521,
+      "step": 4925
+    },
+    {
+      "epoch": 0.8970160116448326,
+      "grad_norm": 0.16474860906600952,
+      "learning_rate": 1.3999661754697636e-06,
+      "loss": 0.13969850540161133,
+      "step": 4930
+    },
+    {
+      "epoch": 0.8979257641921398,
+      "grad_norm": 0.1664883941411972,
+      "learning_rate": 1.3757821406889027e-06,
+      "loss": 0.1399069309234619,
+      "step": 4935
+    },
+    {
+      "epoch": 0.8988355167394468,
+      "grad_norm": 0.16675794124603271,
+      "learning_rate": 1.351802905002386e-06,
+      "loss": 0.14129226207733153,
+      "step": 4940
+    },
+    {
+      "epoch": 0.899745269286754,
+      "grad_norm": 0.17529809474945068,
+      "learning_rate": 1.3280286762869632e-06,
+      "loss": 0.14663081169128417,
+      "step": 4945
+    },
+    {
+      "epoch": 0.9006550218340611,
+      "grad_norm": 0.17758169770240784,
+      "learning_rate": 1.3044596606421795e-06,
+      "loss": 0.13986254930496217,
+      "step": 4950
+    },
+    {
+      "epoch": 0.9015647743813683,
+      "grad_norm": 0.153225839138031,
+      "learning_rate": 1.2810960623885815e-06,
+      "loss": 0.14236698150634766,
+      "step": 4955
+    },
+    {
+      "epoch": 0.9024745269286754,
+      "grad_norm": 0.169761523604393,
+      "learning_rate": 1.2579380840659376e-06,
+      "loss": 0.1450445055961609,
+      "step": 4960
+    },
+    {
+      "epoch": 0.9033842794759825,
+      "grad_norm": 0.16659331321716309,
+      "learning_rate": 1.2349859264315034e-06,
+      "loss": 0.14043926000595092,
+      "step": 4965
+    },
+    {
+      "epoch": 0.9042940320232896,
+      "grad_norm": 0.16748706996440887,
+      "learning_rate": 1.2122397884582553e-06,
+      "loss": 0.14725675582885742,
+      "step": 4970
+    },
+    {
+      "epoch": 0.9052037845705968,
+      "grad_norm": 0.1600511223077774,
+      "learning_rate": 1.1896998673331883e-06,
+      "loss": 0.14551150798797607,
+      "step": 4975
+    },
+    {
+      "epoch": 0.9061135371179039,
+      "grad_norm": 0.24318362772464752,
+      "learning_rate": 1.1673663584555934e-06,
+      "loss": 0.14470888376235963,
+      "step": 4980
+    },
+    {
+      "epoch": 0.9070232896652111,
+      "grad_norm": 0.16443821787834167,
+      "learning_rate": 1.1452394554353706e-06,
+      "loss": 0.13639854192733764,
+      "step": 4985
+    },
+    {
+      "epoch": 0.9079330422125182,
+      "grad_norm": 0.14277774095535278,
+      "learning_rate": 1.1233193500913453e-06,
+      "loss": 0.13749881982803344,
+      "step": 4990
+    },
+    {
+      "epoch": 0.9088427947598253,
+      "grad_norm": 0.1610947549343109,
+      "learning_rate": 1.1016062324496008e-06,
+      "loss": 0.1385629653930664,
+      "step": 4995
+    },
+    {
+      "epoch": 0.9097525473071325,
+      "grad_norm": 0.17888498306274414,
+      "learning_rate": 1.080100290741845e-06,
+      "loss": 0.14225621223449708,
+      "step": 5000
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.748884707820236e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-5000/training_args.bin b/checkpoint-5000/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-5000/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-5100/README.md b/checkpoint-5100/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-5100/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-5100/adapter_config.json b/checkpoint-5100/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-5100/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-5100/adapter_model.safetensors b/checkpoint-5100/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..991125bd90263b3b1244068c71f4d49ed24caf68
--- /dev/null
+++ b/checkpoint-5100/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:85e9de4c60daac973c28ff33d1ea1bc93b4f25c4d855c1ef80e60071f02cc7a5
+size 169741912
diff --git a/checkpoint-5100/chat_template.jinja b/checkpoint-5100/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-5100/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders every entry of `properties` (sorted by key via dictsort) as key:{...}, comma-separated. With filter_keys=true, keys listed in standard_keys are skipped. Per entry the order is: description, then enum (STRING) or items (ARRAY), nullable, properties/required (OBJECT), and finally type. NOTE(review): the `required` argument is not referenced anywhere in this macro body. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- type is emitted unconditionally as the last field; its line also writes the brace that closes the entry -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool as declaration:NAME{description:...,parameters:{...},response:{...}} using tool_data['function']. NOTE(review): the brace closing parameters:{ is written only inside the params['type'] branch, and the brace closing response:{ only when the response type is OBJECT, so other inputs leave the braces unbalanced; confirm whether callers always supply those fields. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Recursively serialises a value: strings are wrapped in <|"|> markers, booleans become true/false, mappings become {key:value,...} sorted by key (keys wrapped in <|"|> only when escape_keys is true), other sequences become [item,...], and anything else is emitted as-is. The string and boolean tests come first so they are not caught by the later branches. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Removes thought-channel spans from `text`: the text is split on the '<channel|>' end marker, and for every piece only the part before a '<|channel>' start marker (if present) is kept. The kept parts are concatenated and the result is trimmed. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {#- Emits <|tool_response>response:NAME{...}<tool_response|>. A mapping response is rendered as key:value pairs sorted by key; any other value is rendered as value:<arg>. -#}
+    {#- A None tool_name (e.g. from dict.get on a message without 'name') would make the string concatenation below raise, so fall back to 'unknown'. -#}
+    {%- set safe_name = tool_name if tool_name is not none else 'unknown' -%}
+    {{- '<|tool_response>response:' + safe_name + '{' -}}
+    {%- if response is mapping -%}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+    {%- else -%}
+        {{- 'value:' + format_argument(response, escape_keys=False) -}}
+    {%- endif -%}
+    {{- '}<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- ns.prev_message_type records the kind of the most recently emitted element (think, tool, tool_call, tool_response, image, audio, video); it is read again when deciding how to close a turn and whether to emit the generation prompt -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the leading system/developer message has been rendered here, so the main loop must not see it again -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}{#- stays -1 when loop_messages contains no user message -#}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}{#- role:tool messages are not rendered as turns of their own; they are consumed by the forward scan of the preceding assistant message below -#}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name. follow.get('name') yields None (not undefined) when the key is missing, so default() needs its boolean flag to substitute 'unknown' -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array; a text part whose 'text' is missing or None contributes an empty string instead of breaking the concatenation -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}{#- tool calls with no response yet: leave the turn open on a <|tool_response> marker; a turn that emitted tool responses but no content is left open as well -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- Open a new model turn for generation, unless the last rendered message ended on a tool call or tool response, in which case no new turn header is written -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-5100/optimizer.pt b/checkpoint-5100/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..08a0964caa41269d32ba88e142ecca6ae873e12a
--- /dev/null
+++ b/checkpoint-5100/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e155e4b28d9304fdc51c12782756f07045b4a1fd0e13dabdde7e825ff164cc2a
+size 72807355
diff --git a/checkpoint-5100/processor_config.json b/checkpoint-5100/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-5100/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-5100/rng_state.pth b/checkpoint-5100/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-5100/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-5100/scheduler.pt b/checkpoint-5100/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5872ecc7886691192f2f521f170cd8ab806e293e
--- /dev/null
+++ b/checkpoint-5100/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c70a8a41be46fdfaa0c5bde2c2de38bd9f972e5f87edea20a2693858227a5b2d
+size 1465
diff --git a/checkpoint-5100/tokenizer.json b/checkpoint-5100/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-5100/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-5100/tokenizer_config.json b/checkpoint-5100/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-5100/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-5100/trainer_state.json b/checkpoint-5100/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a8d3b488292433ce297a3993030e8f69cb21800d
--- /dev/null
+++ b/checkpoint-5100/trainer_state.json
@@ -0,0 +1,7182 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9279475982532751,
+  "eval_steps": 100,
+  "global_step": 5100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6377365356622998,
+      "grad_norm": 0.17014239728450775,
+      "learning_rate": 1.5370045130657366e-05,
+      "loss": 0.14694437980651856,
+      "step": 3505
+    },
+    {
+      "epoch": 0.638646288209607,
+      "grad_norm": 0.14744038879871368,
+      "learning_rate": 1.5302158945041838e-05,
+      "loss": 0.14434736967086792,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6395560407569141,
+      "grad_norm": 0.2069770246744156,
+      "learning_rate": 1.523435683031818e-05,
+      "loss": 0.13982917070388795,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6404657933042213,
+      "grad_norm": 0.17811502516269684,
+      "learning_rate": 1.5166639374265063e-05,
+      "loss": 0.1408839702606201,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6413755458515283,
+      "grad_norm": 0.165786474943161,
+      "learning_rate": 1.509900716392728e-05,
+      "loss": 0.15312877893447877,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6422852983988355,
+      "grad_norm": 0.1633884161710739,
+      "learning_rate": 1.5031460785610596e-05,
+      "loss": 0.1488795518875122,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6431950509461426,
+      "grad_norm": 0.16498984396457672,
+      "learning_rate": 1.4964000824876723e-05,
+      "loss": 0.15031465291976928,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6441048034934498,
+      "grad_norm": 0.18043678998947144,
+      "learning_rate": 1.4896627866538191e-05,
+      "loss": 0.147829806804657,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6450145560407569,
+      "grad_norm": 0.16813597083091736,
+      "learning_rate": 1.4829342494653315e-05,
+      "loss": 0.1418998956680298,
+      "step": 3545
+    },
+    {
+      "epoch": 0.645924308588064,
+      "grad_norm": 0.1817242056131363,
+      "learning_rate": 1.4762145292521118e-05,
+      "loss": 0.14508869647979736,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6468340611353712,
+      "grad_norm": 0.14666494727134705,
+      "learning_rate": 1.469503684267628e-05,
+      "loss": 0.14159854650497436,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6477438136826783,
+      "grad_norm": 0.16485381126403809,
+      "learning_rate": 1.4628017726884086e-05,
+      "loss": 0.14419105052947997,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6486535662299855,
+      "grad_norm": 0.16100342571735382,
+      "learning_rate": 1.4561088526135375e-05,
+      "loss": 0.14501721858978273,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6495633187772926,
+      "grad_norm": 0.16996590793132782,
+      "learning_rate": 1.4494249820641493e-05,
+      "loss": 0.1377166509628296,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6504730713245997,
+      "grad_norm": 0.16168837249279022,
+      "learning_rate": 1.4427502189829339e-05,
+      "loss": 0.1414325475692749,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6513828238719068,
+      "grad_norm": 0.16318906843662262,
+      "learning_rate": 1.436084621233621e-05,
+      "loss": 0.14685193300247193,
+      "step": 3580
+    },
+    {
+      "epoch": 0.652292576419214,
+      "grad_norm": 0.1636219322681427,
+      "learning_rate": 1.4294282466004899e-05,
+      "loss": 0.1405899167060852,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6532023289665211,
+      "grad_norm": 0.1838461309671402,
+      "learning_rate": 1.422781152787865e-05,
+      "loss": 0.14386332035064697,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6541120815138283,
+      "grad_norm": 0.1796344667673111,
+      "learning_rate": 1.4161433974196115e-05,
+      "loss": 0.1513024687767029,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6550218340611353,
+      "grad_norm": 0.16424529254436493,
+      "learning_rate": 1.4095150380386427e-05,
+      "loss": 0.14238927364349366,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6559315866084425,
+      "grad_norm": 0.19264160096645355,
+      "learning_rate": 1.402896132106415e-05,
+      "loss": 0.14297477006912232,
+      "step": 3605
+    },
+    {
+      "epoch": 0.6568413391557496,
+      "grad_norm": 0.18319948017597198,
+      "learning_rate": 1.3962867370024347e-05,
+      "loss": 0.1448880434036255,
+      "step": 3610
+    },
+    {
+      "epoch": 0.6577510917030568,
+      "grad_norm": 0.16507290303707123,
+      "learning_rate": 1.389686910023758e-05,
+      "loss": 0.14724698066711425,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6586608442503639,
+      "grad_norm": 0.17871244251728058,
+      "learning_rate": 1.3830967083844942e-05,
+      "loss": 0.14479386806488037,
+      "step": 3620
+    },
+    {
+      "epoch": 0.659570596797671,
+      "grad_norm": 0.1846228390932083,
+      "learning_rate": 1.3765161892153112e-05,
+      "loss": 0.1453616738319397,
+      "step": 3625
+    },
+    {
+      "epoch": 0.6604803493449781,
+      "grad_norm": 0.17185978591442108,
+      "learning_rate": 1.3699454095629372e-05,
+      "loss": 0.14906206130981445,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6613901018922853,
+      "grad_norm": 0.14751191437244415,
+      "learning_rate": 1.3633844263896698e-05,
+      "loss": 0.13991892337799072,
+      "step": 3635
+    },
+    {
+      "epoch": 0.6622998544395924,
+      "grad_norm": 0.22059763967990875,
+      "learning_rate": 1.3568332965728817e-05,
+      "loss": 0.14680869579315187,
+      "step": 3640
+    },
+    {
+      "epoch": 0.6632096069868996,
+      "grad_norm": 0.15295909345149994,
+      "learning_rate": 1.3502920769045232e-05,
+      "loss": 0.1404443383216858,
+      "step": 3645
+    },
+    {
+      "epoch": 0.6641193595342066,
+      "grad_norm": 0.14600558578968048,
+      "learning_rate": 1.3437608240906364e-05,
+      "loss": 0.14663270711898804,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6650291120815138,
+      "grad_norm": 0.15548352897167206,
+      "learning_rate": 1.3372395947508587e-05,
+      "loss": 0.1431443452835083,
+      "step": 3655
+    },
+    {
+      "epoch": 0.665938864628821,
+      "grad_norm": 0.1813388466835022,
+      "learning_rate": 1.3307284454179342e-05,
+      "loss": 0.1458706736564636,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6668486171761281,
+      "grad_norm": 0.16326870024204254,
+      "learning_rate": 1.3242274325372247e-05,
+      "loss": 0.14700595140457154,
+      "step": 3665
+    },
+    {
+      "epoch": 0.6677583697234353,
+      "grad_norm": 0.18779197335243225,
+      "learning_rate": 1.3177366124662149e-05,
+      "loss": 0.1497237801551819,
+      "step": 3670
+    },
+    {
+      "epoch": 0.6686681222707423,
+      "grad_norm": 0.16291002929210663,
+      "learning_rate": 1.3112560414740315e-05,
+      "loss": 0.1387086868286133,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6695778748180495,
+      "grad_norm": 0.1532297134399414,
+      "learning_rate": 1.3047857757409487e-05,
+      "loss": 0.14497545957565308,
+      "step": 3680
+    },
+    {
+      "epoch": 0.6704876273653566,
+      "grad_norm": 0.14697515964508057,
+      "learning_rate": 1.2983258713579066e-05,
+      "loss": 0.1494283437728882,
+      "step": 3685
+    },
+    {
+      "epoch": 0.6713973799126638,
+      "grad_norm": 0.15213452279567719,
+      "learning_rate": 1.2918763843260218e-05,
+      "loss": 0.1468907594680786,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6723071324599709,
+      "grad_norm": 0.1745215803384781,
+      "learning_rate": 1.285437370556099e-05,
+      "loss": 0.14997754096984864,
+      "step": 3695
+    },
+    {
+      "epoch": 0.673216885007278,
+      "grad_norm": 0.19207637012004852,
+      "learning_rate": 1.2790088858681577e-05,
+      "loss": 0.14202862977981567,
+      "step": 3700
+    },
+    {
+      "epoch": 0.6741266375545851,
+      "grad_norm": 0.1521359086036682,
+      "learning_rate": 1.2725909859909313e-05,
+      "loss": 0.14547673463821412,
+      "step": 3705
+    },
+    {
+      "epoch": 0.6750363901018923,
+      "grad_norm": 0.16975535452365875,
+      "learning_rate": 1.2661837265613999e-05,
+      "loss": 0.14006874561309815,
+      "step": 3710
+    },
+    {
+      "epoch": 0.6759461426491994,
+      "grad_norm": 0.22234582901000977,
+      "learning_rate": 1.2597871631242992e-05,
+      "loss": 0.13691173791885375,
+      "step": 3715
+    },
+    {
+      "epoch": 0.6768558951965066,
+      "grad_norm": 0.16082969307899475,
+      "learning_rate": 1.2534013511316383e-05,
+      "loss": 0.14932308197021485,
+      "step": 3720
+    },
+    {
+      "epoch": 0.6777656477438136,
+      "grad_norm": 0.1751091182231903,
+      "learning_rate": 1.247026345942226e-05,
+      "loss": 0.14531974792480468,
+      "step": 3725
+    },
+    {
+      "epoch": 0.6786754002911208,
+      "grad_norm": 0.15838147699832916,
+      "learning_rate": 1.2406622028211844e-05,
+      "loss": 0.14759832620620728,
+      "step": 3730
+    },
+    {
+      "epoch": 0.6795851528384279,
+      "grad_norm": 0.1771744042634964,
+      "learning_rate": 1.2343089769394714e-05,
+      "loss": 0.1382831573486328,
+      "step": 3735
+    },
+    {
+      "epoch": 0.6804949053857351,
+      "grad_norm": 0.16301538050174713,
+      "learning_rate": 1.2279667233734037e-05,
+      "loss": 0.14444775581359864,
+      "step": 3740
+    },
+    {
+      "epoch": 0.6814046579330422,
+      "grad_norm": 0.1584121286869049,
+      "learning_rate": 1.2216354971041796e-05,
+      "loss": 0.14200170040130616,
+      "step": 3745
+    },
+    {
+      "epoch": 0.6823144104803494,
+      "grad_norm": 0.139187291264534,
+      "learning_rate": 1.2153153530174007e-05,
+      "loss": 0.14318310022354125,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6832241630276564,
+      "grad_norm": 0.13665248453617096,
+      "learning_rate": 1.2090063459025955e-05,
+      "loss": 0.1411946654319763,
+      "step": 3755
+    },
+    {
+      "epoch": 0.6841339155749636,
+      "grad_norm": 0.16273781657218933,
+      "learning_rate": 1.2027085304527475e-05,
+      "loss": 0.14873508214950562,
+      "step": 3760
+    },
+    {
+      "epoch": 0.6850436681222707,
+      "grad_norm": 0.16317526996135712,
+      "learning_rate": 1.1964219612638194e-05,
+      "loss": 0.14644203186035157,
+      "step": 3765
+    },
+    {
+      "epoch": 0.6859534206695779,
+      "grad_norm": 0.17253617942333221,
+      "learning_rate": 1.1901466928342777e-05,
+      "loss": 0.14027841091156007,
+      "step": 3770
+    },
+    {
+      "epoch": 0.6868631732168851,
+      "grad_norm": 0.19692830741405487,
+      "learning_rate": 1.183882779564624e-05,
+      "loss": 0.14411110877990724,
+      "step": 3775
+    },
+    {
+      "epoch": 0.6877729257641921,
+      "grad_norm": 0.15444578230381012,
+      "learning_rate": 1.1776302757569214e-05,
+      "loss": 0.14355008602142333,
+      "step": 3780
+    },
+    {
+      "epoch": 0.6886826783114993,
+      "grad_norm": 0.1622200757265091,
+      "learning_rate": 1.1713892356143239e-05,
+      "loss": 0.14794334173202514,
+      "step": 3785
+    },
+    {
+      "epoch": 0.6895924308588064,
+      "grad_norm": 0.1898501068353653,
+      "learning_rate": 1.1651597132406073e-05,
+      "loss": 0.1418622612953186,
+      "step": 3790
+    },
+    {
+      "epoch": 0.6905021834061136,
+      "grad_norm": 0.17803208529949188,
+      "learning_rate": 1.1589417626396973e-05,
+      "loss": 0.14576040506362914,
+      "step": 3795
+    },
+    {
+      "epoch": 0.6914119359534207,
+      "grad_norm": 0.17138013243675232,
+      "learning_rate": 1.1527354377152053e-05,
+      "loss": 0.14494270086288452,
+      "step": 3800
+    },
+    {
+      "epoch": 0.6923216885007278,
+      "grad_norm": 0.15170913934707642,
+      "learning_rate": 1.1465407922699603e-05,
+      "loss": 0.144084370136261,
+      "step": 3805
+    },
+    {
+      "epoch": 0.6932314410480349,
+      "grad_norm": 0.158562570810318,
+      "learning_rate": 1.1403578800055387e-05,
+      "loss": 0.13636608123779298,
+      "step": 3810
+    },
+    {
+      "epoch": 0.6941411935953421,
+      "grad_norm": 0.17687302827835083,
+      "learning_rate": 1.1341867545218044e-05,
+      "loss": 0.14214688539505005,
+      "step": 3815
+    },
+    {
+      "epoch": 0.6950509461426492,
+      "grad_norm": 0.15394899249076843,
+      "learning_rate": 1.1280274693164378e-05,
+      "loss": 0.14914129972457885,
+      "step": 3820
+    },
+    {
+      "epoch": 0.6959606986899564,
+      "grad_norm": 0.15709355473518372,
+      "learning_rate": 1.12188007778448e-05,
+      "loss": 0.14798580408096312,
+      "step": 3825
+    },
+    {
+      "epoch": 0.6968704512372634,
+      "grad_norm": 0.16631539165973663,
+      "learning_rate": 1.115744633217864e-05,
+      "loss": 0.14756966829299928,
+      "step": 3830
+    },
+    {
+      "epoch": 0.6977802037845706,
+      "grad_norm": 0.15893076360225677,
+      "learning_rate": 1.109621188804951e-05,
+      "loss": 0.14061959981918334,
+      "step": 3835
+    },
+    {
+      "epoch": 0.6986899563318777,
+      "grad_norm": 0.183414489030838,
+      "learning_rate": 1.103509797630077e-05,
+      "loss": 0.1448473334312439,
+      "step": 3840
+    },
+    {
+      "epoch": 0.6995997088791849,
+      "grad_norm": 0.14087305963039398,
+      "learning_rate": 1.0974105126730841e-05,
+      "loss": 0.14369285106658936,
+      "step": 3845
+    },
+    {
+      "epoch": 0.700509461426492,
+      "grad_norm": 0.16919967532157898,
+      "learning_rate": 1.0913233868088685e-05,
+      "loss": 0.1478085398674011,
+      "step": 3850
+    },
+    {
+      "epoch": 0.7014192139737991,
+      "grad_norm": 0.1439533829689026,
+      "learning_rate": 1.0852484728069178e-05,
+      "loss": 0.14376721382141114,
+      "step": 3855
+    },
+    {
+      "epoch": 0.7023289665211062,
+      "grad_norm": 0.17719274759292603,
+      "learning_rate": 1.0791858233308521e-05,
+      "loss": 0.14089040756225585,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7032387190684134,
+      "grad_norm": 0.19753769040107727,
+      "learning_rate": 1.0731354909379754e-05,
+      "loss": 0.15021742582321168,
+      "step": 3865
+    },
+    {
+      "epoch": 0.7041484716157205,
+      "grad_norm": 0.19186992943286896,
+      "learning_rate": 1.0670975280788086e-05,
+      "loss": 0.14113202095031738,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7050582241630277,
+      "grad_norm": 0.1709229201078415,
+      "learning_rate": 1.0610719870966443e-05,
+      "loss": 0.1500566840171814,
+      "step": 3875
+    },
+    {
+      "epoch": 0.7059679767103348,
+      "grad_norm": 0.17846204340457916,
+      "learning_rate": 1.0550589202270892e-05,
+      "loss": 0.15014195442199707,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7068777292576419,
+      "grad_norm": 0.1827082335948944,
+      "learning_rate": 1.0490583795976091e-05,
+      "loss": 0.1423472762107849,
+      "step": 3885
+    },
+    {
+      "epoch": 0.7077874818049491,
+      "grad_norm": 0.17418377101421356,
+      "learning_rate": 1.043070417227083e-05,
+      "loss": 0.14668900966644288,
+      "step": 3890
+    },
+    {
+      "epoch": 0.7086972343522562,
+      "grad_norm": 0.17385616898536682,
+      "learning_rate": 1.0370950850253449e-05,
+      "loss": 0.14627279043197633,
+      "step": 3895
+    },
+    {
+      "epoch": 0.7096069868995634,
+      "grad_norm": 0.16486723721027374,
+      "learning_rate": 1.0311324347927404e-05,
+      "loss": 0.14603652954101562,
+      "step": 3900
+    },
+    {
+      "epoch": 0.7105167394468704,
+      "grad_norm": 0.21806862950325012,
+      "learning_rate": 1.0251825182196732e-05,
+      "loss": 0.1488169550895691,
+      "step": 3905
+    },
+    {
+      "epoch": 0.7114264919941776,
+      "grad_norm": 0.19884569942951202,
+      "learning_rate": 1.019245386886159e-05,
+      "loss": 0.14387656450271608,
+      "step": 3910
+    },
+    {
+      "epoch": 0.7123362445414847,
+      "grad_norm": 0.16139011085033417,
+      "learning_rate": 1.0133210922613789e-05,
+      "loss": 0.1483074426651001,
+      "step": 3915
+    },
+    {
+      "epoch": 0.7132459970887919,
+      "grad_norm": 0.17000740766525269,
+      "learning_rate": 1.007409685703229e-05,
+      "loss": 0.14050065279006957,
+      "step": 3920
+    },
+    {
+      "epoch": 0.714155749636099,
+      "grad_norm": 0.17235304415225983,
+      "learning_rate": 1.0015112184578813e-05,
+      "loss": 0.1440442681312561,
+      "step": 3925
+    },
+    {
+      "epoch": 0.7150655021834061,
+      "grad_norm": 0.15737567842006683,
+      "learning_rate": 9.956257416593362e-06,
+      "loss": 0.14960765838623047,
+      "step": 3930
+    },
+    {
+      "epoch": 0.7159752547307132,
+      "grad_norm": 0.15499180555343628,
+      "learning_rate": 9.897533063289773e-06,
+      "loss": 0.14488829374313356,
+      "step": 3935
+    },
+    {
+      "epoch": 0.7168850072780204,
+      "grad_norm": 0.17744216322898865,
+      "learning_rate": 9.838939633751337e-06,
+      "loss": 0.1416949987411499,
+      "step": 3940
+    },
+    {
+      "epoch": 0.7177947598253275,
+      "grad_norm": 0.1597192883491516,
+      "learning_rate": 9.780477635926358e-06,
+      "loss": 0.14275280237197877,
+      "step": 3945
+    },
+    {
+      "epoch": 0.7187045123726347,
+      "grad_norm": 0.17800374329090118,
+      "learning_rate": 9.722147576623743e-06,
+      "loss": 0.14532098770141602,
+      "step": 3950
+    },
+    {
+      "epoch": 0.7196142649199417,
+      "grad_norm": 0.1828162521123886,
+      "learning_rate": 9.66394996150864e-06,
+      "loss": 0.14525585174560546,
+      "step": 3955
+    },
+    {
+      "epoch": 0.7205240174672489,
+      "grad_norm": 0.1800539344549179,
+      "learning_rate": 9.605885295098005e-06,
+      "loss": 0.14235819578170777,
+      "step": 3960
+    },
+    {
+      "epoch": 0.721433770014556,
+      "grad_norm": 0.16556483507156372,
+      "learning_rate": 9.54795408075628e-06,
+      "loss": 0.13965482711791993,
+      "step": 3965
+    },
+    {
+      "epoch": 0.7223435225618632,
+      "grad_norm": 0.1592024862766266,
+      "learning_rate": 9.49015682069101e-06,
+      "loss": 0.14051042795181273,
+      "step": 3970
+    },
+    {
+      "epoch": 0.7232532751091703,
+      "grad_norm": 0.18988847732543945,
+      "learning_rate": 9.43249401594846e-06,
+      "loss": 0.1436900496482849,
+      "step": 3975
+    },
+    {
+      "epoch": 0.7241630276564774,
+      "grad_norm": 0.24433808028697968,
+      "learning_rate": 9.374966166409329e-06,
+      "loss": 0.14883997440338134,
+      "step": 3980
+    },
+    {
+      "epoch": 0.7250727802037845,
+      "grad_norm": 0.15091639757156372,
+      "learning_rate": 9.317573770784352e-06,
+      "loss": 0.14726560115814208,
+      "step": 3985
+    },
+    {
+      "epoch": 0.7259825327510917,
+      "grad_norm": 0.17045573890209198,
+      "learning_rate": 9.260317326610051e-06,
+      "loss": 0.14120506048202514,
+      "step": 3990
+    },
+    {
+      "epoch": 0.7268922852983989,
+      "grad_norm": 0.18847957253456116,
+      "learning_rate": 9.203197330244343e-06,
+      "loss": 0.1377041220664978,
+      "step": 3995
+    },
+    {
+      "epoch": 0.727802037845706,
+      "grad_norm": 0.1516445279121399,
+      "learning_rate": 9.14621427686229e-06,
+      "loss": 0.14043946266174318,
+      "step": 4000
+    },
+    {
+      "epoch": 0.7287117903930131,
+      "grad_norm": 0.18264050781726837,
+      "learning_rate": 9.0893686604518e-06,
+      "loss": 0.14080368280410765,
+      "step": 4005
+    },
+    {
+      "epoch": 0.7296215429403202,
+      "grad_norm": 0.19129371643066406,
+      "learning_rate": 9.032660973809312e-06,
+      "loss": 0.1402561902999878,
+      "step": 4010
+    },
+    {
+      "epoch": 0.7305312954876274,
+      "grad_norm": 0.15762710571289062,
+      "learning_rate": 8.976091708535567e-06,
+      "loss": 0.14421157836914061,
+      "step": 4015
+    },
+    {
+      "epoch": 0.7314410480349345,
+      "grad_norm": 0.17785198986530304,
+      "learning_rate": 8.919661355031331e-06,
+      "loss": 0.14999009370803834,
+      "step": 4020
+    },
+    {
+      "epoch": 0.7323508005822417,
+      "grad_norm": 0.15306031703948975,
+      "learning_rate": 8.8633704024931e-06,
+      "loss": 0.14101698398590087,
+      "step": 4025
+    },
+    {
+      "epoch": 0.7332605531295487,
+      "grad_norm": 0.16481758654117584,
+      "learning_rate": 8.807219338908968e-06,
+      "loss": 0.14170764684677123,
+      "step": 4030
+    },
+    {
+      "epoch": 0.7341703056768559,
+      "grad_norm": 0.14892235398292542,
+      "learning_rate": 8.751208651054257e-06,
+      "loss": 0.15317896604537964,
+      "step": 4035
+    },
+    {
+      "epoch": 0.735080058224163,
+      "grad_norm": 0.1775592565536499,
+      "learning_rate": 8.695338824487409e-06,
+      "loss": 0.1520617723464966,
+      "step": 4040
+    },
+    {
+      "epoch": 0.7359898107714702,
+      "grad_norm": 0.1614258885383606,
+      "learning_rate": 8.639610343545728e-06,
+      "loss": 0.13747400045394897,
+      "step": 4045
+    },
+    {
+      "epoch": 0.7368995633187773,
+      "grad_norm": 0.21415506303310394,
+      "learning_rate": 8.58402369134117e-06,
+      "loss": 0.1432439088821411,
+      "step": 4050
+    },
+    {
+      "epoch": 0.7378093158660844,
+      "grad_norm": 0.1759418249130249,
+      "learning_rate": 8.528579349756205e-06,
+      "loss": 0.141641104221344,
+      "step": 4055
+    },
+    {
+      "epoch": 0.7387190684133915,
+      "grad_norm": 0.16738329827785492,
+      "learning_rate": 8.47327779943957e-06,
+      "loss": 0.14294810295104982,
+      "step": 4060
+    },
+    {
+      "epoch": 0.7396288209606987,
+      "grad_norm": 0.13916844129562378,
+      "learning_rate": 8.41811951980217e-06,
+      "loss": 0.13876968622207642,
+      "step": 4065
+    },
+    {
+      "epoch": 0.7405385735080058,
+      "grad_norm": 0.1828441321849823,
+      "learning_rate": 8.36310498901288e-06,
+      "loss": 0.148428475856781,
+      "step": 4070
+    },
+    {
+      "epoch": 0.741448326055313,
+      "grad_norm": 0.16534076631069183,
+      "learning_rate": 8.308234683994415e-06,
+      "loss": 0.14222711324691772,
+      "step": 4075
+    },
+    {
+      "epoch": 0.74235807860262,
+      "grad_norm": 0.17922644317150116,
+      "learning_rate": 8.253509080419198e-06,
+      "loss": 0.14365782737731933,
+      "step": 4080
+    },
+    {
+      "epoch": 0.7432678311499272,
+      "grad_norm": 0.15061035752296448,
+      "learning_rate": 8.198928652705204e-06,
+      "loss": 0.13571925163269044,
+      "step": 4085
+    },
+    {
+      "epoch": 0.7441775836972343,
+      "grad_norm": 0.18075402081012726,
+      "learning_rate": 8.144493874011908e-06,
+      "loss": 0.14385528564453126,
+      "step": 4090
+    },
+    {
+      "epoch": 0.7450873362445415,
+      "grad_norm": 0.16514739394187927,
+      "learning_rate": 8.090205216236135e-06,
+      "loss": 0.14920626878738402,
+      "step": 4095
+    },
+    {
+      "epoch": 0.7459970887918487,
+      "grad_norm": 0.16453702747821808,
+      "learning_rate": 8.03606315000797e-06,
+      "loss": 0.14704222679138185,
+      "step": 4100
+    },
+    {
+      "epoch": 0.7469068413391557,
+      "grad_norm": 0.16719917953014374,
+      "learning_rate": 7.982068144686707e-06,
+      "loss": 0.14722511768341065,
+      "step": 4105
+    },
+    {
+      "epoch": 0.7478165938864629,
+      "grad_norm": 0.18499110639095306,
+      "learning_rate": 7.92822066835677e-06,
+      "loss": 0.1401848554611206,
+      "step": 4110
+    },
+    {
+      "epoch": 0.74872634643377,
+      "grad_norm": 0.17249563336372375,
+      "learning_rate": 7.87452118782363e-06,
+      "loss": 0.15132423639297485,
+      "step": 4115
+    },
+    {
+      "epoch": 0.7496360989810772,
+      "grad_norm": 0.15049682557582855,
+      "learning_rate": 7.8209701686098e-06,
+      "loss": 0.1341150164604187,
+      "step": 4120
+    },
+    {
+      "epoch": 0.7505458515283843,
+      "grad_norm": 0.16892646253108978,
+      "learning_rate": 7.767568074950751e-06,
+      "loss": 0.1466840147972107,
+      "step": 4125
+    },
+    {
+      "epoch": 0.7514556040756915,
+      "grad_norm": 0.17288286983966827,
+      "learning_rate": 7.714315369790942e-06,
+      "loss": 0.13819680213928223,
+      "step": 4130
+    },
+    {
+      "epoch": 0.7523653566229985,
+      "grad_norm": 0.21893996000289917,
+      "learning_rate": 7.661212514779745e-06,
+      "loss": 0.14369510412216185,
+      "step": 4135
+    },
+    {
+      "epoch": 0.7532751091703057,
+      "grad_norm": 0.1674601435661316,
+      "learning_rate": 7.608259970267509e-06,
+      "loss": 0.14810250997543334,
+      "step": 4140
+    },
+    {
+      "epoch": 0.7541848617176128,
+      "grad_norm": 0.15875539183616638,
+      "learning_rate": 7.555458195301526e-06,
+      "loss": 0.14103198051452637,
+      "step": 4145
+    },
+    {
+      "epoch": 0.75509461426492,
+      "grad_norm": 0.19454079866409302,
+      "learning_rate": 7.502807647622037e-06,
+      "loss": 0.13848764896392823,
+      "step": 4150
+    },
+    {
+      "epoch": 0.756004366812227,
+      "grad_norm": 0.1795455813407898,
+      "learning_rate": 7.450308783658341e-06,
+      "loss": 0.14459335803985596,
+      "step": 4155
+    },
+    {
+      "epoch": 0.7569141193595342,
+      "grad_norm": 0.1643362045288086,
+      "learning_rate": 7.397962058524735e-06,
+      "loss": 0.14335378408432006,
+      "step": 4160
+    },
+    {
+      "epoch": 0.7578238719068413,
+      "grad_norm": 0.16362066566944122,
+      "learning_rate": 7.3457679260166475e-06,
+      "loss": 0.14222005605697632,
+      "step": 4165
+    },
+    {
+      "epoch": 0.7587336244541485,
+      "grad_norm": 0.17313003540039062,
+      "learning_rate": 7.293726838606674e-06,
+      "loss": 0.14272255897521974,
+      "step": 4170
+    },
+    {
+      "epoch": 0.7596433770014556,
+      "grad_norm": 0.1809929460287094,
+      "learning_rate": 7.2418392474406405e-06,
+      "loss": 0.14089123010635377,
+      "step": 4175
+    },
+    {
+      "epoch": 0.7605531295487628,
+      "grad_norm": 0.14306005835533142,
+      "learning_rate": 7.19010560233373e-06,
+      "loss": 0.13531534671783446,
+      "step": 4180
+    },
+    {
+      "epoch": 0.7614628820960698,
+      "grad_norm": 0.15525390207767487,
+      "learning_rate": 7.138526351766559e-06,
+      "loss": 0.14340845346450806,
+      "step": 4185
+    },
+    {
+      "epoch": 0.762372634643377,
+      "grad_norm": 0.24478943645954132,
+      "learning_rate": 7.087101942881263e-06,
+      "loss": 0.14744555950164795,
+      "step": 4190
+    },
+    {
+      "epoch": 0.7632823871906841,
+      "grad_norm": 0.31335577368736267,
+      "learning_rate": 7.035832821477711e-06,
+      "loss": 0.1484094500541687,
+      "step": 4195
+    },
+    {
+      "epoch": 0.7641921397379913,
+      "grad_norm": 0.15140366554260254,
+      "learning_rate": 6.984719432009515e-06,
+      "loss": 0.14991614818572999,
+      "step": 4200
+    },
+    {
+      "epoch": 0.7651018922852983,
+      "grad_norm": 0.16125506162643433,
+      "learning_rate": 6.933762217580289e-06,
+      "loss": 0.1408134937286377,
+      "step": 4205
+    },
+    {
+      "epoch": 0.7660116448326055,
+      "grad_norm": 0.2501450181007385,
+      "learning_rate": 6.882961619939726e-06,
+      "loss": 0.13875640630722047,
+      "step": 4210
+    },
+    {
+      "epoch": 0.7669213973799127,
+      "grad_norm": 0.16227811574935913,
+      "learning_rate": 6.8323180794798245e-06,
+      "loss": 0.14138660430908204,
+      "step": 4215
+    },
+    {
+      "epoch": 0.7678311499272198,
+      "grad_norm": 0.16676810383796692,
+      "learning_rate": 6.781832035231053e-06,
+      "loss": 0.14696706533432008,
+      "step": 4220
+    },
+    {
+      "epoch": 0.768740902474527,
+      "grad_norm": 0.14638574421405792,
+      "learning_rate": 6.731503924858518e-06,
+      "loss": 0.14263020753860473,
+      "step": 4225
+    },
+    {
+      "epoch": 0.769650655021834,
+      "grad_norm": 0.17093190550804138,
+      "learning_rate": 6.681334184658211e-06,
+      "loss": 0.14694111347198485,
+      "step": 4230
+    },
+    {
+      "epoch": 0.7705604075691412,
+      "grad_norm": 0.17174287140369415,
+      "learning_rate": 6.631323249553201e-06,
+      "loss": 0.13854929208755493,
+      "step": 4235
+    },
+    {
+      "epoch": 0.7714701601164483,
+      "grad_norm": 0.14599016308784485,
+      "learning_rate": 6.5814715530898745e-06,
+      "loss": 0.14058833122253417,
+      "step": 4240
+    },
+    {
+      "epoch": 0.7723799126637555,
+      "grad_norm": 0.16222265362739563,
+      "learning_rate": 6.531779527434176e-06,
+      "loss": 0.1428326725959778,
+      "step": 4245
+    },
+    {
+      "epoch": 0.7732896652110626,
+      "grad_norm": 0.1741994023323059,
+      "learning_rate": 6.482247603367839e-06,
+      "loss": 0.13985042572021483,
+      "step": 4250
+    },
+    {
+      "epoch": 0.7741994177583698,
+      "grad_norm": 0.17427101731300354,
+      "learning_rate": 6.432876210284688e-06,
+      "loss": 0.1442667603492737,
+      "step": 4255
+    },
+    {
+      "epoch": 0.7751091703056768,
+      "grad_norm": 0.1665259599685669,
+      "learning_rate": 6.383665776186912e-06,
+      "loss": 0.1421986222267151,
+      "step": 4260
+    },
+    {
+      "epoch": 0.776018922852984,
+      "grad_norm": 0.1728232353925705,
+      "learning_rate": 6.334616727681303e-06,
+      "loss": 0.1367053508758545,
+      "step": 4265
+    },
+    {
+      "epoch": 0.7769286754002911,
+      "grad_norm": 0.15882381796836853,
+      "learning_rate": 6.285729489975639e-06,
+      "loss": 0.14551182985305786,
+      "step": 4270
+    },
+    {
+      "epoch": 0.7778384279475983,
+      "grad_norm": 0.242042675614357,
+      "learning_rate": 6.2370044868749115e-06,
+      "loss": 0.1455132007598877,
+      "step": 4275
+    },
+    {
+      "epoch": 0.7787481804949054,
+      "grad_norm": 0.1599501073360443,
+      "learning_rate": 6.188442140777742e-06,
+      "loss": 0.1424942970275879,
+      "step": 4280
+    },
+    {
+      "epoch": 0.7796579330422125,
+      "grad_norm": 0.15182635188102722,
+      "learning_rate": 6.140042872672647e-06,
+      "loss": 0.14212887287139891,
+      "step": 4285
+    },
+    {
+      "epoch": 0.7805676855895196,
+      "grad_norm": 0.1720375418663025,
+      "learning_rate": 6.091807102134403e-06,
+      "loss": 0.14243412017822266,
+      "step": 4290
+    },
+    {
+      "epoch": 0.7814774381368268,
+      "grad_norm": 0.16436047852039337,
+      "learning_rate": 6.043735247320454e-06,
+      "loss": 0.15035657882690429,
+      "step": 4295
+    },
+    {
+      "epoch": 0.7823871906841339,
+      "grad_norm": 0.1498408019542694,
+      "learning_rate": 5.995827724967218e-06,
+      "loss": 0.14494839906692505,
+      "step": 4300
+    },
+    {
+      "epoch": 0.7832969432314411,
+      "grad_norm": 0.16924560070037842,
+      "learning_rate": 5.948084950386535e-06,
+      "loss": 0.13581212759017944,
+      "step": 4305
+    },
+    {
+      "epoch": 0.7842066957787481,
+      "grad_norm": 0.15889139473438263,
+      "learning_rate": 5.900507337462036e-06,
+      "loss": 0.15071530342102052,
+      "step": 4310
+    },
+    {
+      "epoch": 0.7851164483260553,
+      "grad_norm": 0.17201054096221924,
+      "learning_rate": 5.853095298645542e-06,
+      "loss": 0.1398628830909729,
+      "step": 4315
+    },
+    {
+      "epoch": 0.7860262008733624,
+      "grad_norm": 0.17965619266033173,
+      "learning_rate": 5.805849244953548e-06,
+      "loss": 0.14666696786880493,
+      "step": 4320
+    },
+    {
+      "epoch": 0.7869359534206696,
+      "grad_norm": 0.17514032125473022,
+      "learning_rate": 5.758769585963569e-06,
+      "loss": 0.1383386731147766,
+      "step": 4325
+    },
+    {
+      "epoch": 0.7878457059679768,
+      "grad_norm": 0.17497631907463074,
+      "learning_rate": 5.7118567298106744e-06,
+      "loss": 0.14362354278564454,
+      "step": 4330
+    },
+    {
+      "epoch": 0.7887554585152838,
+      "grad_norm": 0.16770458221435547,
+      "learning_rate": 5.665111083183905e-06,
+      "loss": 0.14136618375778198,
+      "step": 4335
+    },
+    {
+      "epoch": 0.789665211062591,
+      "grad_norm": 0.17134106159210205,
+      "learning_rate": 5.618533051322747e-06,
+      "loss": 0.1401529550552368,
+      "step": 4340
+    },
+    {
+      "epoch": 0.7905749636098981,
+      "grad_norm": 0.19458788633346558,
+      "learning_rate": 5.5721230380136435e-06,
+      "loss": 0.1393273115158081,
+      "step": 4345
+    },
+    {
+      "epoch": 0.7914847161572053,
+      "grad_norm": 0.19483692944049835,
+      "learning_rate": 5.525881445586467e-06,
+      "loss": 0.1369825482368469,
+      "step": 4350
+    },
+    {
+      "epoch": 0.7923944687045124,
+      "grad_norm": 0.3052191734313965,
+      "learning_rate": 5.4798086749110495e-06,
+      "loss": 0.14762181043624878,
+      "step": 4355
+    },
+    {
+      "epoch": 0.7933042212518195,
+      "grad_norm": 0.164458766579628,
+      "learning_rate": 5.4339051253937065e-06,
+      "loss": 0.14501686096191407,
+      "step": 4360
+    },
+    {
+      "epoch": 0.7942139737991266,
+      "grad_norm": 0.1719193458557129,
+      "learning_rate": 5.3881711949737625e-06,
+      "loss": 0.13321092128753662,
+      "step": 4365
+    },
+    {
+      "epoch": 0.7951237263464338,
+      "grad_norm": 0.17219696938991547,
+      "learning_rate": 5.342607280120121e-06,
+      "loss": 0.1413906455039978,
+      "step": 4370
+    },
+    {
+      "epoch": 0.7960334788937409,
+      "grad_norm": 0.15083056688308716,
+      "learning_rate": 5.297213775827789e-06,
+      "loss": 0.14772192239761353,
+      "step": 4375
+    },
+    {
+      "epoch": 0.7969432314410481,
+      "grad_norm": 0.1699071079492569,
+      "learning_rate": 5.251991075614507e-06,
+      "loss": 0.1392375946044922,
+      "step": 4380
+    },
+    {
+      "epoch": 0.7978529839883551,
+      "grad_norm": 0.1680395007133484,
+      "learning_rate": 5.206939571517302e-06,
+      "loss": 0.14185575246810914,
+      "step": 4385
+    },
+    {
+      "epoch": 0.7987627365356623,
+      "grad_norm": 0.16526710987091064,
+      "learning_rate": 5.162059654089083e-06,
+      "loss": 0.15001428127288818,
+      "step": 4390
+    },
+    {
+      "epoch": 0.7996724890829694,
+      "grad_norm": 0.16281752288341522,
+      "learning_rate": 5.1173517123952794e-06,
+      "loss": 0.13747023344039916,
+      "step": 4395
+    },
+    {
+      "epoch": 0.8005822416302766,
+      "grad_norm": 0.1454378366470337,
+      "learning_rate": 5.072816134010458e-06,
+      "loss": 0.14710829257965088,
+      "step": 4400
+    },
+    {
+      "epoch": 0.8014919941775837,
+      "grad_norm": 0.16565890610218048,
+      "learning_rate": 5.028453305014966e-06,
+      "loss": 0.14138611555099487,
+      "step": 4405
+    },
+    {
+      "epoch": 0.8024017467248908,
+      "grad_norm": 0.1962810605764389,
+      "learning_rate": 4.984263609991577e-06,
+      "loss": 0.13836177587509155,
+      "step": 4410
+    },
+    {
+      "epoch": 0.8033114992721979,
+      "grad_norm": 0.16091369092464447,
+      "learning_rate": 4.940247432022149e-06,
+      "loss": 0.14407440423965454,
+      "step": 4415
+    },
+    {
+      "epoch": 0.8042212518195051,
+      "grad_norm": 0.1930241584777832,
+      "learning_rate": 4.89640515268433e-06,
+      "loss": 0.14346336126327514,
+      "step": 4420
+    },
+    {
+      "epoch": 0.8051310043668122,
+      "grad_norm": 0.19301500916481018,
+      "learning_rate": 4.852737152048242e-06,
+      "loss": 0.14174317121505736,
+      "step": 4425
+    },
+    {
+      "epoch": 0.8060407569141194,
+      "grad_norm": 0.1541353315114975,
+      "learning_rate": 4.80924380867315e-06,
+      "loss": 0.14100592136383056,
+      "step": 4430
+    },
+    {
+      "epoch": 0.8069505094614265,
+      "grad_norm": 0.16285750269889832,
+      "learning_rate": 4.765925499604243e-06,
+      "loss": 0.1441288709640503,
+      "step": 4435
+    },
+    {
+      "epoch": 0.8078602620087336,
+      "grad_norm": 0.17382675409317017,
+      "learning_rate": 4.722782600369299e-06,
+      "loss": 0.13763951063156127,
+      "step": 4440
+    },
+    {
+      "epoch": 0.8087700145560408,
+      "grad_norm": 0.1697344034910202,
+      "learning_rate": 4.679815484975505e-06,
+      "loss": 0.1410105347633362,
+      "step": 4445
+    },
+    {
+      "epoch": 0.8096797671033479,
+      "grad_norm": 0.19964542984962463,
+      "learning_rate": 4.637024525906131e-06,
+      "loss": 0.1439276695251465,
+      "step": 4450
+    },
+    {
+      "epoch": 0.8105895196506551,
+      "grad_norm": 0.165307879447937,
+      "learning_rate": 4.59441009411736e-06,
+      "loss": 0.13897504806518554,
+      "step": 4455
+    },
+    {
+      "epoch": 0.8114992721979621,
+      "grad_norm": 0.16687989234924316,
+      "learning_rate": 4.551972559035067e-06,
+      "loss": 0.1422593355178833,
+      "step": 4460
+    },
+    {
+      "epoch": 0.8124090247452693,
+      "grad_norm": 0.15737789869308472,
+      "learning_rate": 4.509712288551571e-06,
+      "loss": 0.1452128052711487,
+      "step": 4465
+    },
+    {
+      "epoch": 0.8133187772925764,
+      "grad_norm": 0.17116659879684448,
+      "learning_rate": 4.467629649022509e-06,
+      "loss": 0.14385371208190917,
+      "step": 4470
+    },
+    {
+      "epoch": 0.8142285298398836,
+      "grad_norm": 0.17457640171051025,
+      "learning_rate": 4.425725005263623e-06,
+      "loss": 0.14808475971221924,
+      "step": 4475
+    },
+    {
+      "epoch": 0.8151382823871907,
+      "grad_norm": 0.1621970385313034,
+      "learning_rate": 4.383998720547583e-06,
+      "loss": 0.13927959203720092,
+      "step": 4480
+    },
+    {
+      "epoch": 0.8160480349344978,
+      "grad_norm": 0.176296666264534,
+      "learning_rate": 4.342451156600896e-06,
+      "loss": 0.15041060447692872,
+      "step": 4485
+    },
+    {
+      "epoch": 0.8169577874818049,
+      "grad_norm": 0.17157645523548126,
+      "learning_rate": 4.301082673600698e-06,
+      "loss": 0.13932652473449708,
+      "step": 4490
+    },
+    {
+      "epoch": 0.8178675400291121,
+      "grad_norm": 0.15378527343273163,
+      "learning_rate": 4.259893630171682e-06,
+      "loss": 0.1406856894493103,
+      "step": 4495
+    },
+    {
+      "epoch": 0.8187772925764192,
+      "grad_norm": 0.1750226765871048,
+      "learning_rate": 4.218884383382987e-06,
+      "loss": 0.1350164532661438,
+      "step": 4500
+    },
+    {
+      "epoch": 0.8196870451237264,
+      "grad_norm": 0.1393742561340332,
+      "learning_rate": 4.178055288745053e-06,
+      "loss": 0.13769235610961914,
+      "step": 4505
+    },
+    {
+      "epoch": 0.8205967976710334,
+      "grad_norm": 0.1668994128704071,
+      "learning_rate": 4.137406700206617e-06,
+      "loss": 0.14029752016067504,
+      "step": 4510
+    },
+    {
+      "epoch": 0.8215065502183406,
+      "grad_norm": 0.1833454668521881,
+      "learning_rate": 4.0969389701515675e-06,
+      "loss": 0.14276301860809326,
+      "step": 4515
+    },
+    {
+      "epoch": 0.8224163027656477,
+      "grad_norm": 0.16187874972820282,
+      "learning_rate": 4.056652449395945e-06,
+      "loss": 0.1444832682609558,
+      "step": 4520
+    },
+    {
+      "epoch": 0.8233260553129549,
+      "grad_norm": 0.1453280746936798,
+      "learning_rate": 4.01654748718488e-06,
+      "loss": 0.14512733221054078,
+      "step": 4525
+    },
+    {
+      "epoch": 0.824235807860262,
+      "grad_norm": 0.1782725751399994,
+      "learning_rate": 3.976624431189563e-06,
+      "loss": 0.14093561172485353,
+      "step": 4530
+    },
+    {
+      "epoch": 0.8251455604075691,
+      "grad_norm": 0.17374491691589355,
+      "learning_rate": 3.936883627504234e-06,
+      "loss": 0.14031401872634888,
+      "step": 4535
+    },
+    {
+      "epoch": 0.8260553129548762,
+      "grad_norm": 0.1609172821044922,
+      "learning_rate": 3.897325420643174e-06,
+      "loss": 0.1428336262702942,
+      "step": 4540
+    },
+    {
+      "epoch": 0.8269650655021834,
+      "grad_norm": 0.1520884931087494,
+      "learning_rate": 3.85795015353774e-06,
+      "loss": 0.1460547924041748,
+      "step": 4545
+    },
+    {
+      "epoch": 0.8278748180494906,
+      "grad_norm": 0.20986326038837433,
+      "learning_rate": 3.818758167533376e-06,
+      "loss": 0.14706350564956666,
+      "step": 4550
+    },
+    {
+      "epoch": 0.8287845705967977,
+      "grad_norm": 0.16825413703918457,
+      "learning_rate": 3.7797498023866396e-06,
+      "loss": 0.14507200717926025,
+      "step": 4555
+    },
+    {
+      "epoch": 0.8296943231441049,
+      "grad_norm": 0.16758380830287933,
+      "learning_rate": 3.740925396262296e-06,
+      "loss": 0.14898381233215333,
+      "step": 4560
+    },
+    {
+      "epoch": 0.8306040756914119,
+      "grad_norm": 0.15207453072071075,
+      "learning_rate": 3.7022852857303503e-06,
+      "loss": 0.14138854742050172,
+      "step": 4565
+    },
+    {
+      "epoch": 0.8315138282387191,
+      "grad_norm": 0.15150749683380127,
+      "learning_rate": 3.66382980576315e-06,
+      "loss": 0.13894975185394287,
+      "step": 4570
+    },
+    {
+      "epoch": 0.8324235807860262,
+      "grad_norm": 0.17071188986301422,
+      "learning_rate": 3.625559289732472e-06,
+      "loss": 0.14072470664978026,
+      "step": 4575
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.154335618019104,
+      "learning_rate": 3.5874740694066294e-06,
+      "loss": 0.13791344165802003,
+      "step": 4580
+    },
+    {
+      "epoch": 0.8342430858806404,
+      "grad_norm": 0.14017128944396973,
+      "learning_rate": 3.5495744749476116e-06,
+      "loss": 0.14427922964096068,
+      "step": 4585
+    },
+    {
+      "epoch": 0.8351528384279476,
+      "grad_norm": 0.17210033535957336,
+      "learning_rate": 3.5118608349081983e-06,
+      "loss": 0.15191166400909423,
+      "step": 4590
+    },
+    {
+      "epoch": 0.8360625909752547,
+      "grad_norm": 0.18715685606002808,
+      "learning_rate": 3.4743334762291358e-06,
+      "loss": 0.14451316595077515,
+      "step": 4595
+    },
+    {
+      "epoch": 0.8369723435225619,
+      "grad_norm": 0.18079884350299835,
+      "learning_rate": 3.436992724236293e-06,
+      "loss": 0.13530746698379517,
+      "step": 4600
+    },
+    {
+      "epoch": 0.837882096069869,
+      "grad_norm": 0.13519920408725739,
+      "learning_rate": 3.399838902637817e-06,
+      "loss": 0.1477964401245117,
+      "step": 4605
+    },
+    {
+      "epoch": 0.8387918486171762,
+      "grad_norm": 0.1778026670217514,
+      "learning_rate": 3.3628723335213885e-06,
+      "loss": 0.14419831037521363,
+      "step": 4610
+    },
+    {
+      "epoch": 0.8397016011644832,
+      "grad_norm": 0.15165366232395172,
+      "learning_rate": 3.326093337351355e-06,
+      "loss": 0.13888469934463502,
+      "step": 4615
+    },
+    {
+      "epoch": 0.8406113537117904,
+      "grad_norm": 0.17049473524093628,
+      "learning_rate": 3.2895022329660018e-06,
+      "loss": 0.14438477754592896,
+      "step": 4620
+    },
+    {
+      "epoch": 0.8415211062590975,
+      "grad_norm": 0.16536414623260498,
+      "learning_rate": 3.2530993375747833e-06,
+      "loss": 0.1444351315498352,
+      "step": 4625
+    },
+    {
+      "epoch": 0.8424308588064047,
+      "grad_norm": 0.17570015788078308,
+      "learning_rate": 3.2168849667555402e-06,
+      "loss": 0.13861945867538453,
+      "step": 4630
+    },
+    {
+      "epoch": 0.8433406113537117,
+      "grad_norm": 0.1699545532464981,
+      "learning_rate": 3.1808594344518132e-06,
+      "loss": 0.13902754783630372,
+      "step": 4635
+    },
+    {
+      "epoch": 0.8442503639010189,
+      "grad_norm": 0.12331254780292511,
+      "learning_rate": 3.1450230529700837e-06,
+      "loss": 0.14104254245758058,
+      "step": 4640
+    },
+    {
+      "epoch": 0.845160116448326,
+      "grad_norm": 0.1508190929889679,
+      "learning_rate": 3.1093761329770708e-06,
+      "loss": 0.13288766145706177,
+      "step": 4645
+    },
+    {
+      "epoch": 0.8460698689956332,
+      "grad_norm": 0.19049489498138428,
+      "learning_rate": 3.0739189834970735e-06,
+      "loss": 0.14914840459823608,
+      "step": 4650
+    },
+    {
+      "epoch": 0.8469796215429404,
+      "grad_norm": 0.1662369966506958,
+      "learning_rate": 3.0386519119092293e-06,
+      "loss": 0.14222898483276367,
+      "step": 4655
+    },
+    {
+      "epoch": 0.8478893740902474,
+      "grad_norm": 0.18985967338085175,
+      "learning_rate": 3.0035752239449126e-06,
+      "loss": 0.14431113004684448,
+      "step": 4660
+    },
+    {
+      "epoch": 0.8487991266375546,
+      "grad_norm": 0.17005261778831482,
+      "learning_rate": 2.9686892236850337e-06,
+      "loss": 0.14140807390213012,
+      "step": 4665
+    },
+    {
+      "epoch": 0.8497088791848617,
+      "grad_norm": 0.16786684095859528,
+      "learning_rate": 2.9339942135574394e-06,
+      "loss": 0.14161460399627684,
+      "step": 4670
+    },
+    {
+      "epoch": 0.8506186317321689,
+      "grad_norm": 0.16358181834220886,
+      "learning_rate": 2.899490494334281e-06,
+      "loss": 0.14674670696258546,
+      "step": 4675
+    },
+    {
+      "epoch": 0.851528384279476,
+      "grad_norm": 0.1651349812746048,
+      "learning_rate": 2.8651783651293867e-06,
+      "loss": 0.13794611692428588,
+      "step": 4680
+    },
+    {
+      "epoch": 0.8524381368267832,
+      "grad_norm": 0.16934923827648163,
+      "learning_rate": 2.831058123395694e-06,
+      "loss": 0.13199397325515747,
+      "step": 4685
+    },
+    {
+      "epoch": 0.8533478893740902,
+      "grad_norm": 0.1704150140285492,
+      "learning_rate": 2.797130064922665e-06,
+      "loss": 0.14044904708862305,
+      "step": 4690
+    },
+    {
+      "epoch": 0.8542576419213974,
+      "grad_norm": 0.1814192682504654,
+      "learning_rate": 2.7633944838337143e-06,
+      "loss": 0.1465100646018982,
+      "step": 4695
+    },
+    {
+      "epoch": 0.8551673944687045,
+      "grad_norm": 0.18942610919475555,
+      "learning_rate": 2.729851672583669e-06,
+      "loss": 0.14685982465744019,
+      "step": 4700
+    },
+    {
+      "epoch": 0.8560771470160117,
+      "grad_norm": 0.17895208299160004,
+      "learning_rate": 2.6965019219562155e-06,
+      "loss": 0.13971571922302245,
+      "step": 4705
+    },
+    {
+      "epoch": 0.8569868995633187,
+      "grad_norm": 0.22735828161239624,
+      "learning_rate": 2.6633455210614055e-06,
+      "loss": 0.13776102066040039,
+      "step": 4710
+    },
+    {
+      "epoch": 0.8578966521106259,
+      "grad_norm": 0.16779793798923492,
+      "learning_rate": 2.630382757333133e-06,
+      "loss": 0.14134042263031005,
+      "step": 4715
+    },
+    {
+      "epoch": 0.858806404657933,
+      "grad_norm": 0.2148888260126114,
+      "learning_rate": 2.597613916526637e-06,
+      "loss": 0.14680721759796142,
+      "step": 4720
+    },
+    {
+      "epoch": 0.8597161572052402,
+      "grad_norm": 0.16560257971286774,
+      "learning_rate": 2.565039282716045e-06,
+      "loss": 0.14137234687805175,
+      "step": 4725
+    },
+    {
+      "epoch": 0.8606259097525473,
+      "grad_norm": 0.16197068989276886,
+      "learning_rate": 2.532659138291879e-06,
+      "loss": 0.14969314336776735,
+      "step": 4730
+    },
+    {
+      "epoch": 0.8615356622998545,
+      "grad_norm": 0.14650246500968933,
+      "learning_rate": 2.5004737639586497e-06,
+      "loss": 0.13532910346984864,
+      "step": 4735
+    },
+    {
+      "epoch": 0.8624454148471615,
+      "grad_norm": 0.1565634310245514,
+      "learning_rate": 2.4684834387323943e-06,
+      "loss": 0.14146244525909424,
+      "step": 4740
+    },
+    {
+      "epoch": 0.8633551673944687,
+      "grad_norm": 0.18060864508152008,
+      "learning_rate": 2.4366884399382393e-06,
+      "loss": 0.14218534231185914,
+      "step": 4745
+    },
+    {
+      "epoch": 0.8642649199417758,
+      "grad_norm": 0.24613255262374878,
+      "learning_rate": 2.4050890432080557e-06,
+      "loss": 0.13907679319381713,
+      "step": 4750
+    },
+    {
+      "epoch": 0.865174672489083,
+      "grad_norm": 0.16036023199558258,
+      "learning_rate": 2.3736855224780057e-06,
+      "loss": 0.13718113899230958,
+      "step": 4755
+    },
+    {
+      "epoch": 0.86608442503639,
+      "grad_norm": 0.16678516566753387,
+      "learning_rate": 2.3424781499862075e-06,
+      "loss": 0.1327962040901184,
+      "step": 4760
+    },
+    {
+      "epoch": 0.8669941775836972,
+      "grad_norm": 0.1763770878314972,
+      "learning_rate": 2.3114671962703727e-06,
+      "loss": 0.14390318393707274,
+      "step": 4765
+    },
+    {
+      "epoch": 0.8679039301310044,
+      "grad_norm": 0.17735697329044342,
+      "learning_rate": 2.280652930165428e-06,
+      "loss": 0.15223288536071777,
+      "step": 4770
+    },
+    {
+      "epoch": 0.8688136826783115,
+      "grad_norm": 0.15827041864395142,
+      "learning_rate": 2.250035618801241e-06,
+      "loss": 0.14296332597732545,
+      "step": 4775
+    },
+    {
+      "epoch": 0.8697234352256187,
+      "grad_norm": 0.16876135766506195,
+      "learning_rate": 2.219615527600244e-06,
+      "loss": 0.1359076738357544,
+      "step": 4780
+    },
+    {
+      "epoch": 0.8706331877729258,
+      "grad_norm": 0.1800110638141632,
+      "learning_rate": 2.189392920275174e-06,
+      "loss": 0.1424281358718872,
+      "step": 4785
+    },
+    {
+      "epoch": 0.8715429403202329,
+      "grad_norm": 0.1409560889005661,
+      "learning_rate": 2.159368058826783e-06,
+      "loss": 0.14480490684509278,
+      "step": 4790
+    },
+    {
+      "epoch": 0.87245269286754,
+      "grad_norm": 0.1634288728237152,
+      "learning_rate": 2.129541203541535e-06,
+      "loss": 0.14513269662857056,
+      "step": 4795
+    },
+    {
+      "epoch": 0.8733624454148472,
+      "grad_norm": 0.17126062512397766,
+      "learning_rate": 2.099912612989391e-06,
+      "loss": 0.13546934127807617,
+      "step": 4800
+    },
+    {
+      "epoch": 0.8742721979621543,
+      "grad_norm": 0.16704080998897552,
+      "learning_rate": 2.0704825440215457e-06,
+      "loss": 0.13852492570877076,
+      "step": 4805
+    },
+    {
+      "epoch": 0.8751819505094615,
+      "grad_norm": 0.1725970208644867,
+      "learning_rate": 2.0412512517681946e-06,
+      "loss": 0.14504197835922242,
+      "step": 4810
+    },
+    {
+      "epoch": 0.8760917030567685,
+      "grad_norm": 0.1700201779603958,
+      "learning_rate": 2.0122189896363387e-06,
+      "loss": 0.14312338829040527,
+      "step": 4815
+    },
+    {
+      "epoch": 0.8770014556040757,
+      "grad_norm": 0.16491736471652985,
+      "learning_rate": 1.9833860093075834e-06,
+      "loss": 0.14062976837158203,
+      "step": 4820
+    },
+    {
+      "epoch": 0.8779112081513828,
+      "grad_norm": 0.13748787343502045,
+      "learning_rate": 1.9547525607359537e-06,
+      "loss": 0.1346171498298645,
+      "step": 4825
+    },
+    {
+      "epoch": 0.87882096069869,
+      "grad_norm": 0.16399399936199188,
+      "learning_rate": 1.926318892145712e-06,
+      "loss": 0.14178123474121093,
+      "step": 4830
+    },
+    {
+      "epoch": 0.879730713245997,
+      "grad_norm": 0.14491963386535645,
+      "learning_rate": 1.8980852500292412e-06,
+      "loss": 0.1408564567565918,
+      "step": 4835
+    },
+    {
+      "epoch": 0.8806404657933042,
+      "grad_norm": 0.17335423827171326,
+      "learning_rate": 1.8700518791448851e-06,
+      "loss": 0.14403265714645386,
+      "step": 4840
+    },
+    {
+      "epoch": 0.8815502183406113,
+      "grad_norm": 0.17399625480175018,
+      "learning_rate": 1.8422190225148155e-06,
+      "loss": 0.14289036989212037,
+      "step": 4845
+    },
+    {
+      "epoch": 0.8824599708879185,
+      "grad_norm": 0.17945612967014313,
+      "learning_rate": 1.814586921422956e-06,
+      "loss": 0.14494109153747559,
+      "step": 4850
+    },
+    {
+      "epoch": 0.8833697234352256,
+      "grad_norm": 0.1910620480775833,
+      "learning_rate": 1.7871558154128664e-06,
+      "loss": 0.13726245164871215,
+      "step": 4855
+    },
+    {
+      "epoch": 0.8842794759825328,
+      "grad_norm": 0.1771879345178604,
+      "learning_rate": 1.7599259422856756e-06,
+      "loss": 0.1464752197265625,
+      "step": 4860
+    },
+    {
+      "epoch": 0.8851892285298398,
+      "grad_norm": 0.19427461922168732,
+      "learning_rate": 1.7328975380980218e-06,
+      "loss": 0.13823356628417968,
+      "step": 4865
+    },
+    {
+      "epoch": 0.886098981077147,
+      "grad_norm": 0.1491149365901947,
+      "learning_rate": 1.7060708371599897e-06,
+      "loss": 0.1338604211807251,
+      "step": 4870
+    },
+    {
+      "epoch": 0.8870087336244541,
+      "grad_norm": 0.16087733209133148,
+      "learning_rate": 1.6794460720331057e-06,
+      "loss": 0.14184389114379883,
+      "step": 4875
+    },
+    {
+      "epoch": 0.8879184861717613,
+      "grad_norm": 0.14506325125694275,
+      "learning_rate": 1.653023473528309e-06,
+      "loss": 0.14267687797546386,
+      "step": 4880
+    },
+    {
+      "epoch": 0.8888282387190685,
+      "grad_norm": 0.16886365413665771,
+      "learning_rate": 1.626803270703936e-06,
+      "loss": 0.14266083240509034,
+      "step": 4885
+    },
+    {
+      "epoch": 0.8897379912663755,
+      "grad_norm": 0.1891999989748001,
+      "learning_rate": 1.6007856908637652e-06,
+      "loss": 0.1398016929626465,
+      "step": 4890
+    },
+    {
+      "epoch": 0.8906477438136827,
+      "grad_norm": 0.17645299434661865,
+      "learning_rate": 1.5749709595550083e-06,
+      "loss": 0.13869571685791016,
+      "step": 4895
+    },
+    {
+      "epoch": 0.8915574963609898,
+      "grad_norm": 0.17714262008666992,
+      "learning_rate": 1.549359300566408e-06,
+      "loss": 0.14957486391067504,
+      "step": 4900
+    },
+    {
+      "epoch": 0.892467248908297,
+      "grad_norm": 0.18025240302085876,
+      "learning_rate": 1.5239509359262355e-06,
+      "loss": 0.1358652949333191,
+      "step": 4905
+    },
+    {
+      "epoch": 0.8933770014556041,
+      "grad_norm": 0.17539937794208527,
+      "learning_rate": 1.4987460859004154e-06,
+      "loss": 0.13833394050598144,
+      "step": 4910
+    },
+    {
+      "epoch": 0.8942867540029112,
+      "grad_norm": 0.1772230565547943,
+      "learning_rate": 1.4737449689905953e-06,
+      "loss": 0.14202116727828978,
+      "step": 4915
+    },
+    {
+      "epoch": 0.8951965065502183,
+      "grad_norm": 0.1670161783695221,
+      "learning_rate": 1.4489478019322433e-06,
+      "loss": 0.1403665542602539,
+      "step": 4920
+    },
+    {
+      "epoch": 0.8961062590975255,
+      "grad_norm": 0.1697034239768982,
+      "learning_rate": 1.4243547996927926e-06,
+      "loss": 0.1401481032371521,
+      "step": 4925
+    },
+    {
+      "epoch": 0.8970160116448326,
+      "grad_norm": 0.16474860906600952,
+      "learning_rate": 1.3999661754697636e-06,
+      "loss": 0.13969850540161133,
+      "step": 4930
+    },
+    {
+      "epoch": 0.8979257641921398,
+      "grad_norm": 0.1664883941411972,
+      "learning_rate": 1.3757821406889027e-06,
+      "loss": 0.1399069309234619,
+      "step": 4935
+    },
+    {
+      "epoch": 0.8988355167394468,
+      "grad_norm": 0.16675794124603271,
+      "learning_rate": 1.351802905002386e-06,
+      "loss": 0.14129226207733153,
+      "step": 4940
+    },
+    {
+      "epoch": 0.899745269286754,
+      "grad_norm": 0.17529809474945068,
+      "learning_rate": 1.3280286762869632e-06,
+      "loss": 0.14663081169128417,
+      "step": 4945
+    },
+    {
+      "epoch": 0.9006550218340611,
+      "grad_norm": 0.17758169770240784,
+      "learning_rate": 1.3044596606421795e-06,
+      "loss": 0.13986254930496217,
+      "step": 4950
+    },
+    {
+      "epoch": 0.9015647743813683,
+      "grad_norm": 0.153225839138031,
+      "learning_rate": 1.2810960623885815e-06,
+      "loss": 0.14236698150634766,
+      "step": 4955
+    },
+    {
+      "epoch": 0.9024745269286754,
+      "grad_norm": 0.169761523604393,
+      "learning_rate": 1.2579380840659376e-06,
+      "loss": 0.1450445055961609,
+      "step": 4960
+    },
+    {
+      "epoch": 0.9033842794759825,
+      "grad_norm": 0.16659331321716309,
+      "learning_rate": 1.2349859264315034e-06,
+      "loss": 0.14043926000595092,
+      "step": 4965
+    },
+    {
+      "epoch": 0.9042940320232896,
+      "grad_norm": 0.16748706996440887,
+      "learning_rate": 1.2122397884582553e-06,
+      "loss": 0.14725675582885742,
+      "step": 4970
+    },
+    {
+      "epoch": 0.9052037845705968,
+      "grad_norm": 0.1600511223077774,
+      "learning_rate": 1.1896998673331883e-06,
+      "loss": 0.14551150798797607,
+      "step": 4975
+    },
+    {
+      "epoch": 0.9061135371179039,
+      "grad_norm": 0.24318362772464752,
+      "learning_rate": 1.1673663584555934e-06,
+      "loss": 0.14470888376235963,
+      "step": 4980
+    },
+    {
+      "epoch": 0.9070232896652111,
+      "grad_norm": 0.16443821787834167,
+      "learning_rate": 1.1452394554353706e-06,
+      "loss": 0.13639854192733764,
+      "step": 4985
+    },
+    {
+      "epoch": 0.9079330422125182,
+      "grad_norm": 0.14277774095535278,
+      "learning_rate": 1.1233193500913453e-06,
+      "loss": 0.13749881982803344,
+      "step": 4990
+    },
+    {
+      "epoch": 0.9088427947598253,
+      "grad_norm": 0.1610947549343109,
+      "learning_rate": 1.1016062324496008e-06,
+      "loss": 0.1385629653930664,
+      "step": 4995
+    },
+    {
+      "epoch": 0.9097525473071325,
+      "grad_norm": 0.17888498306274414,
+      "learning_rate": 1.080100290741845e-06,
+      "loss": 0.14225621223449708,
+      "step": 5000
+    },
+    {
+      "epoch": 0.9106622998544396,
+      "grad_norm": 0.17488449811935425,
+      "learning_rate": 1.0588017114037729e-06,
+      "loss": 0.14187805652618407,
+      "step": 5005
+    },
+    {
+      "epoch": 0.9115720524017468,
+      "grad_norm": 0.16410665214061737,
+      "learning_rate": 1.0377106790734392e-06,
+      "loss": 0.1407416582107544,
+      "step": 5010
+    },
+    {
+      "epoch": 0.9124818049490538,
+      "grad_norm": 0.18115971982479095,
+      "learning_rate": 1.016827376589674e-06,
+      "loss": 0.1427263855934143,
+      "step": 5015
+    },
+    {
+      "epoch": 0.913391557496361,
+      "grad_norm": 0.18507841229438782,
+      "learning_rate": 9.961519849904898e-07,
+      "loss": 0.1390499472618103,
+      "step": 5020
+    },
+    {
+      "epoch": 0.9143013100436681,
+      "grad_norm": 0.21296796202659607,
+      "learning_rate": 9.75684683511513e-07,
+      "loss": 0.1382216691970825,
+      "step": 5025
+    },
+    {
+      "epoch": 0.9152110625909753,
+      "grad_norm": 0.2308044582605362,
+      "learning_rate": 9.55425649584435e-07,
+      "loss": 0.14271280765533448,
+      "step": 5030
+    },
+    {
+      "epoch": 0.9161208151382824,
+      "grad_norm": 0.15796682238578796,
+      "learning_rate": 9.353750588354527e-07,
+      "loss": 0.13807624578475952,
+      "step": 5035
+    },
+    {
+      "epoch": 0.9170305676855895,
+      "grad_norm": 0.1695316582918167,
+      "learning_rate": 9.155330850837834e-07,
+      "loss": 0.14289476871490478,
+      "step": 5040
+    },
+    {
+      "epoch": 0.9179403202328966,
+      "grad_norm": 0.1738404780626297,
+      "learning_rate": 8.958999003401191e-07,
+      "loss": 0.14070619344711305,
+      "step": 5045
+    },
+    {
+      "epoch": 0.9188500727802038,
+      "grad_norm": 0.20618964731693268,
+      "learning_rate": 8.764756748051662e-07,
+      "loss": 0.14535053968429565,
+      "step": 5050
+    },
+    {
+      "epoch": 0.9197598253275109,
+      "grad_norm": 0.1506137251853943,
+      "learning_rate": 8.572605768681546e-07,
+      "loss": 0.13995139598846434,
+      "step": 5055
+    },
+    {
+      "epoch": 0.9206695778748181,
+      "grad_norm": 0.17772039771080017,
+      "learning_rate": 8.382547731053708e-07,
+      "loss": 0.14470311403274536,
+      "step": 5060
+    },
+    {
+      "epoch": 0.9215793304221251,
+      "grad_norm": 0.19897456467151642,
+      "learning_rate": 8.194584282787382e-07,
+      "loss": 0.144488525390625,
+      "step": 5065
+    },
+    {
+      "epoch": 0.9224890829694323,
+      "grad_norm": 0.15899236500263214,
+      "learning_rate": 8.008717053343606e-07,
+      "loss": 0.1352991580963135,
+      "step": 5070
+    },
+    {
+      "epoch": 0.9233988355167394,
+      "grad_norm": 0.14965768158435822,
+      "learning_rate": 7.824947654011345e-07,
+      "loss": 0.13827911615371705,
+      "step": 5075
+    },
+    {
+      "epoch": 0.9243085880640466,
+      "grad_norm": 0.43651485443115234,
+      "learning_rate": 7.643277677893329e-07,
+      "loss": 0.14149526357650757,
+      "step": 5080
+    },
+    {
+      "epoch": 0.9252183406113537,
+      "grad_norm": 0.19912713766098022,
+      "learning_rate": 7.463708699892325e-07,
+      "loss": 0.14357032775878906,
+      "step": 5085
+    },
+    {
+      "epoch": 0.9261280931586608,
+      "grad_norm": 0.1635904610157013,
+      "learning_rate": 7.286242276697524e-07,
+      "loss": 0.13550699949264527,
+      "step": 5090
+    },
+    {
+      "epoch": 0.9270378457059679,
+      "grad_norm": 0.19391080737113953,
+      "learning_rate": 7.11087994677101e-07,
+      "loss": 0.14674756526947022,
+      "step": 5095
+    },
+    {
+      "epoch": 0.9279475982532751,
+      "grad_norm": 0.17458125948905945,
+      "learning_rate": 6.937623230334284e-07,
+      "loss": 0.14155579805374147,
+      "step": 5100
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.80383382252852e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-5100/training_args.bin b/checkpoint-5100/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-5100/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-5200/README.md b/checkpoint-5200/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-5200/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for the unsloth/gemma-4-E4B-it LoRA adapter (checkpoint-5200)
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-5200/adapter_config.json b/checkpoint-5200/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-5200/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-5200/adapter_model.safetensors b/checkpoint-5200/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0a8d989d01ea447e029b7061f74fd9b090841cb4
--- /dev/null
+++ b/checkpoint-5200/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d4132af238f49ee03df7f66e26b8f70f7512186b9a13bfe82a31c1aed04cd19
+size 169741912
diff --git a/checkpoint-5200/chat_template.jinja b/checkpoint-5200/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-5200/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders each entry of `properties` (sorted by key) as comma-separated `key:{ ... }` text: optional description, enum (STRING), items (ARRAY), nullable, properties/required (OBJECT), then type last. With filter_keys=true, keys listed in standard_keys are skipped. NOTE(review): the `required` argument is not read anywhere in this body. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}{#- schema keywords that are skipped as property names when filter_keys is true -#}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- set to true once any field has been written for this entry, so later fields get a leading comma -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- no usable 'properties' mapping: recurse over the value itself with filter_keys=true so schema keywords are not rendered as properties -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- the type field is always written last, followed by the brace that closes this entry -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool from tool_data['function'] as `declaration:NAME{description:...` followed by an optional `,parameters:{` section (when 'parameters' is truthy) and an optional `,response:{` section (when 'response' is present), then a closing brace. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}{#- NOTE(review): the brace that closes `parameters:{` is written only inside this branch, so a schema without 'type' leaves it unclosed - confirm that is intended -#}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}{#- NOTE(review): the brace that closes `response:{` is written only when the type is OBJECT - confirm other response types are never passed -#}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Recursively serialises `argument`: strings are wrapped in <|"|> markers, booleans become true/false, mappings become a brace-delimited key:value list sorted by key (keys wrapped in <|"|> only when escape_keys is true), sequences become a bracketed comma-separated list, and anything else is emitted as-is. -#}
+    {%- if argument is string -%}{#- tested first: Jinja's `sequence` test further down would also match a string -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Returns `text` with channel spans removed: the text is split on '<channel|>', each piece is cut at its first '<|channel>' (pieces without one are kept whole), and the concatenation is trimmed. -#}
+    {%- set ns = namespace(result='') -%}{#- namespace object so the loop body can accumulate into result across iterations -#}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Wraps one tool result between '<|tool_response>' and '<tool_response|>'. A mapping response is written as response:TOOL_NAME followed by a brace-delimited key:value list sorted by key; any other response is written under the single key `value`. Values go through format_argument with escape_keys=False, so keys are emitted unquoted. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- ns.prev_message_type records what was rendered last; it is read after the message loop to decide whether a model turn is opened -#}
+{%- set loop_messages = messages -%}{#- default: the main loop renders every message; narrowed below when messages[0] is absorbed into the system turn -#}
+{{- bos_token -}}
+{#- System turn: emitted when thinking is enabled, tools are supplied, or the first message has role system/developer. Order inside the turn: optional <|think|> line, system text, then one <|tool>...<tool|> per tool. -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- The thinking flag goes first inside the system turn, ahead of any system text -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}{#- the filter applies before '+': each part is trimmed, then followed by one space (including the last part) -#}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the first message has been rendered here, so the main loop must skip it -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Reasoning guard pre-pass: ns_turn.last_user_idx ends up as the 0-based position of the final 'user' entry in loop_messages, or -1 when there is none -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for candidate in loop_messages -%}
+    {%- if candidate['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = loop.index0 -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Main loop: render each non-tool message as a turn. role:tool messages are never rendered on their own; they are consumed by the forward scan of the assistant message that issued the calls. -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: walk backwards to the nearest non-tool message; if it was also an assistant message, do not emit a second <|turn>model header -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as a thinking channel, but only for tool-calling messages located after the last user message -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+            {#- Tool calls: mapping arguments are emitted key-sorted (dictsort) as key:value pairs; string arguments are passed through verbatim -#}
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+            {#- Tool responses: ns_tr_out.flag records whether at least one response block was rendered for this message. default(..., true) is used so null values (not only undefined ones) fall back. -#}
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown', true), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan the consecutive role:tool messages that follow; the scan stops at the first non-tool message -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve the tool name: start from the tool message's own name ('unknown' when absent or null), then prefer the function name of the call whose id equals tool_call_id -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as a string or a content-parts array; only parts of type 'text' are kept and concatenated -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {#- Capture the message content so it can be both emitted and tested for emptiness; assistant text has its thinking spans stripped, other text is only trimmed -#}
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+        {#- Close the turn: a tool call with no rendered response ends on an open <|tool_response>; rendered responses with no trailing content leave the turn open; every other case ends with <turn|> -#}
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt and ns.prev_message_type not in ['tool_response', 'tool_call'] -%}
+    {#- Open a model turn for generation. Skipped when ns.prev_message_type is -#}
+    {#- 'tool_call' or 'tool_response', i.e. when the last rendered piece was one of those. -#}
+    {{- '<|turn>model\n' -}}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-5200/optimizer.pt b/checkpoint-5200/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..939f4536c13053ee071dab2632e2741ebcc25c50
--- /dev/null
+++ b/checkpoint-5200/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7172af83fe1eb61895afe57e7fa607cb3b9c14fffa96ff4215f95b993b3cf93d
+size 72807355
diff --git a/checkpoint-5200/processor_config.json b/checkpoint-5200/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-5200/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-5200/rng_state.pth b/checkpoint-5200/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-5200/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-5200/scheduler.pt b/checkpoint-5200/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..bdaf72ea9881a24b87c7c0436cdc9643e84d0692
--- /dev/null
+++ b/checkpoint-5200/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a5ffec2bd06b88fab5cacdc3271508d4c91b0c41f2c3d99454223ab1ca20a73
+size 1465
diff --git a/checkpoint-5200/tokenizer.json b/checkpoint-5200/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-5200/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-5200/tokenizer_config.json b/checkpoint-5200/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-5200/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-5200/trainer_state.json b/checkpoint-5200/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..91a801a13367f0bd58b4800a43761fda3910d13c
--- /dev/null
+++ b/checkpoint-5200/trainer_state.json
@@ -0,0 +1,7322 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9461426491994177,
+  "eval_steps": 100,
+  "global_step": 5200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6377365356622998,
+      "grad_norm": 0.17014239728450775,
+      "learning_rate": 1.5370045130657366e-05,
+      "loss": 0.14694437980651856,
+      "step": 3505
+    },
+    {
+      "epoch": 0.638646288209607,
+      "grad_norm": 0.14744038879871368,
+      "learning_rate": 1.5302158945041838e-05,
+      "loss": 0.14434736967086792,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6395560407569141,
+      "grad_norm": 0.2069770246744156,
+      "learning_rate": 1.523435683031818e-05,
+      "loss": 0.13982917070388795,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6404657933042213,
+      "grad_norm": 0.17811502516269684,
+      "learning_rate": 1.5166639374265063e-05,
+      "loss": 0.1408839702606201,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6413755458515283,
+      "grad_norm": 0.165786474943161,
+      "learning_rate": 1.509900716392728e-05,
+      "loss": 0.15312877893447877,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6422852983988355,
+      "grad_norm": 0.1633884161710739,
+      "learning_rate": 1.5031460785610596e-05,
+      "loss": 0.1488795518875122,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6431950509461426,
+      "grad_norm": 0.16498984396457672,
+      "learning_rate": 1.4964000824876723e-05,
+      "loss": 0.15031465291976928,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6441048034934498,
+      "grad_norm": 0.18043678998947144,
+      "learning_rate": 1.4896627866538191e-05,
+      "loss": 0.147829806804657,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6450145560407569,
+      "grad_norm": 0.16813597083091736,
+      "learning_rate": 1.4829342494653315e-05,
+      "loss": 0.1418998956680298,
+      "step": 3545
+    },
+    {
+      "epoch": 0.645924308588064,
+      "grad_norm": 0.1817242056131363,
+      "learning_rate": 1.4762145292521118e-05,
+      "loss": 0.14508869647979736,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6468340611353712,
+      "grad_norm": 0.14666494727134705,
+      "learning_rate": 1.469503684267628e-05,
+      "loss": 0.14159854650497436,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6477438136826783,
+      "grad_norm": 0.16485381126403809,
+      "learning_rate": 1.4628017726884086e-05,
+      "loss": 0.14419105052947997,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6486535662299855,
+      "grad_norm": 0.16100342571735382,
+      "learning_rate": 1.4561088526135375e-05,
+      "loss": 0.14501721858978273,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6495633187772926,
+      "grad_norm": 0.16996590793132782,
+      "learning_rate": 1.4494249820641493e-05,
+      "loss": 0.1377166509628296,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6504730713245997,
+      "grad_norm": 0.16168837249279022,
+      "learning_rate": 1.4427502189829339e-05,
+      "loss": 0.1414325475692749,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6513828238719068,
+      "grad_norm": 0.16318906843662262,
+      "learning_rate": 1.436084621233621e-05,
+      "loss": 0.14685193300247193,
+      "step": 3580
+    },
+    {
+      "epoch": 0.652292576419214,
+      "grad_norm": 0.1636219322681427,
+      "learning_rate": 1.4294282466004899e-05,
+      "loss": 0.1405899167060852,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6532023289665211,
+      "grad_norm": 0.1838461309671402,
+      "learning_rate": 1.422781152787865e-05,
+      "loss": 0.14386332035064697,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6541120815138283,
+      "grad_norm": 0.1796344667673111,
+      "learning_rate": 1.4161433974196115e-05,
+      "loss": 0.1513024687767029,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6550218340611353,
+      "grad_norm": 0.16424529254436493,
+      "learning_rate": 1.4095150380386427e-05,
+      "loss": 0.14238927364349366,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6559315866084425,
+      "grad_norm": 0.19264160096645355,
+      "learning_rate": 1.402896132106415e-05,
+      "loss": 0.14297477006912232,
+      "step": 3605
+    },
+    {
+      "epoch": 0.6568413391557496,
+      "grad_norm": 0.18319948017597198,
+      "learning_rate": 1.3962867370024347e-05,
+      "loss": 0.1448880434036255,
+      "step": 3610
+    },
+    {
+      "epoch": 0.6577510917030568,
+      "grad_norm": 0.16507290303707123,
+      "learning_rate": 1.389686910023758e-05,
+      "loss": 0.14724698066711425,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6586608442503639,
+      "grad_norm": 0.17871244251728058,
+      "learning_rate": 1.3830967083844942e-05,
+      "loss": 0.14479386806488037,
+      "step": 3620
+    },
+    {
+      "epoch": 0.659570596797671,
+      "grad_norm": 0.1846228390932083,
+      "learning_rate": 1.3765161892153112e-05,
+      "loss": 0.1453616738319397,
+      "step": 3625
+    },
+    {
+      "epoch": 0.6604803493449781,
+      "grad_norm": 0.17185978591442108,
+      "learning_rate": 1.3699454095629372e-05,
+      "loss": 0.14906206130981445,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6613901018922853,
+      "grad_norm": 0.14751191437244415,
+      "learning_rate": 1.3633844263896698e-05,
+      "loss": 0.13991892337799072,
+      "step": 3635
+    },
+    {
+      "epoch": 0.6622998544395924,
+      "grad_norm": 0.22059763967990875,
+      "learning_rate": 1.3568332965728817e-05,
+      "loss": 0.14680869579315187,
+      "step": 3640
+    },
+    {
+      "epoch": 0.6632096069868996,
+      "grad_norm": 0.15295909345149994,
+      "learning_rate": 1.3502920769045232e-05,
+      "loss": 0.1404443383216858,
+      "step": 3645
+    },
+    {
+      "epoch": 0.6641193595342066,
+      "grad_norm": 0.14600558578968048,
+      "learning_rate": 1.3437608240906364e-05,
+      "loss": 0.14663270711898804,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6650291120815138,
+      "grad_norm": 0.15548352897167206,
+      "learning_rate": 1.3372395947508587e-05,
+      "loss": 0.1431443452835083,
+      "step": 3655
+    },
+    {
+      "epoch": 0.665938864628821,
+      "grad_norm": 0.1813388466835022,
+      "learning_rate": 1.3307284454179342e-05,
+      "loss": 0.1458706736564636,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6668486171761281,
+      "grad_norm": 0.16326870024204254,
+      "learning_rate": 1.3242274325372247e-05,
+      "loss": 0.14700595140457154,
+      "step": 3665
+    },
+    {
+      "epoch": 0.6677583697234353,
+      "grad_norm": 0.18779197335243225,
+      "learning_rate": 1.3177366124662149e-05,
+      "loss": 0.1497237801551819,
+      "step": 3670
+    },
+    {
+      "epoch": 0.6686681222707423,
+      "grad_norm": 0.16291002929210663,
+      "learning_rate": 1.3112560414740315e-05,
+      "loss": 0.1387086868286133,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6695778748180495,
+      "grad_norm": 0.1532297134399414,
+      "learning_rate": 1.3047857757409487e-05,
+      "loss": 0.14497545957565308,
+      "step": 3680
+    },
+    {
+      "epoch": 0.6704876273653566,
+      "grad_norm": 0.14697515964508057,
+      "learning_rate": 1.2983258713579066e-05,
+      "loss": 0.1494283437728882,
+      "step": 3685
+    },
+    {
+      "epoch": 0.6713973799126638,
+      "grad_norm": 0.15213452279567719,
+      "learning_rate": 1.2918763843260218e-05,
+      "loss": 0.1468907594680786,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6723071324599709,
+      "grad_norm": 0.1745215803384781,
+      "learning_rate": 1.285437370556099e-05,
+      "loss": 0.14997754096984864,
+      "step": 3695
+    },
+    {
+      "epoch": 0.673216885007278,
+      "grad_norm": 0.19207637012004852,
+      "learning_rate": 1.2790088858681577e-05,
+      "loss": 0.14202862977981567,
+      "step": 3700
+    },
+    {
+      "epoch": 0.6741266375545851,
+      "grad_norm": 0.1521359086036682,
+      "learning_rate": 1.2725909859909313e-05,
+      "loss": 0.14547673463821412,
+      "step": 3705
+    },
+    {
+      "epoch": 0.6750363901018923,
+      "grad_norm": 0.16975535452365875,
+      "learning_rate": 1.2661837265613999e-05,
+      "loss": 0.14006874561309815,
+      "step": 3710
+    },
+    {
+      "epoch": 0.6759461426491994,
+      "grad_norm": 0.22234582901000977,
+      "learning_rate": 1.2597871631242992e-05,
+      "loss": 0.13691173791885375,
+      "step": 3715
+    },
+    {
+      "epoch": 0.6768558951965066,
+      "grad_norm": 0.16082969307899475,
+      "learning_rate": 1.2534013511316383e-05,
+      "loss": 0.14932308197021485,
+      "step": 3720
+    },
+    {
+      "epoch": 0.6777656477438136,
+      "grad_norm": 0.1751091182231903,
+      "learning_rate": 1.247026345942226e-05,
+      "loss": 0.14531974792480468,
+      "step": 3725
+    },
+    {
+      "epoch": 0.6786754002911208,
+      "grad_norm": 0.15838147699832916,
+      "learning_rate": 1.2406622028211844e-05,
+      "loss": 0.14759832620620728,
+      "step": 3730
+    },
+    {
+      "epoch": 0.6795851528384279,
+      "grad_norm": 0.1771744042634964,
+      "learning_rate": 1.2343089769394714e-05,
+      "loss": 0.1382831573486328,
+      "step": 3735
+    },
+    {
+      "epoch": 0.6804949053857351,
+      "grad_norm": 0.16301538050174713,
+      "learning_rate": 1.2279667233734037e-05,
+      "loss": 0.14444775581359864,
+      "step": 3740
+    },
+    {
+      "epoch": 0.6814046579330422,
+      "grad_norm": 0.1584121286869049,
+      "learning_rate": 1.2216354971041796e-05,
+      "loss": 0.14200170040130616,
+      "step": 3745
+    },
+    {
+      "epoch": 0.6823144104803494,
+      "grad_norm": 0.139187291264534,
+      "learning_rate": 1.2153153530174007e-05,
+      "loss": 0.14318310022354125,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6832241630276564,
+      "grad_norm": 0.13665248453617096,
+      "learning_rate": 1.2090063459025955e-05,
+      "loss": 0.1411946654319763,
+      "step": 3755
+    },
+    {
+      "epoch": 0.6841339155749636,
+      "grad_norm": 0.16273781657218933,
+      "learning_rate": 1.2027085304527475e-05,
+      "loss": 0.14873508214950562,
+      "step": 3760
+    },
+    {
+      "epoch": 0.6850436681222707,
+      "grad_norm": 0.16317526996135712,
+      "learning_rate": 1.1964219612638194e-05,
+      "loss": 0.14644203186035157,
+      "step": 3765
+    },
+    {
+      "epoch": 0.6859534206695779,
+      "grad_norm": 0.17253617942333221,
+      "learning_rate": 1.1901466928342777e-05,
+      "loss": 0.14027841091156007,
+      "step": 3770
+    },
+    {
+      "epoch": 0.6868631732168851,
+      "grad_norm": 0.19692830741405487,
+      "learning_rate": 1.183882779564624e-05,
+      "loss": 0.14411110877990724,
+      "step": 3775
+    },
+    {
+      "epoch": 0.6877729257641921,
+      "grad_norm": 0.15444578230381012,
+      "learning_rate": 1.1776302757569214e-05,
+      "loss": 0.14355008602142333,
+      "step": 3780
+    },
+    {
+      "epoch": 0.6886826783114993,
+      "grad_norm": 0.1622200757265091,
+      "learning_rate": 1.1713892356143239e-05,
+      "loss": 0.14794334173202514,
+      "step": 3785
+    },
+    {
+      "epoch": 0.6895924308588064,
+      "grad_norm": 0.1898501068353653,
+      "learning_rate": 1.1651597132406073e-05,
+      "loss": 0.1418622612953186,
+      "step": 3790
+    },
+    {
+      "epoch": 0.6905021834061136,
+      "grad_norm": 0.17803208529949188,
+      "learning_rate": 1.1589417626396973e-05,
+      "loss": 0.14576040506362914,
+      "step": 3795
+    },
+    {
+      "epoch": 0.6914119359534207,
+      "grad_norm": 0.17138013243675232,
+      "learning_rate": 1.1527354377152053e-05,
+      "loss": 0.14494270086288452,
+      "step": 3800
+    },
+    {
+      "epoch": 0.6923216885007278,
+      "grad_norm": 0.15170913934707642,
+      "learning_rate": 1.1465407922699603e-05,
+      "loss": 0.144084370136261,
+      "step": 3805
+    },
+    {
+      "epoch": 0.6932314410480349,
+      "grad_norm": 0.158562570810318,
+      "learning_rate": 1.1403578800055387e-05,
+      "loss": 0.13636608123779298,
+      "step": 3810
+    },
+    {
+      "epoch": 0.6941411935953421,
+      "grad_norm": 0.17687302827835083,
+      "learning_rate": 1.1341867545218044e-05,
+      "loss": 0.14214688539505005,
+      "step": 3815
+    },
+    {
+      "epoch": 0.6950509461426492,
+      "grad_norm": 0.15394899249076843,
+      "learning_rate": 1.1280274693164378e-05,
+      "loss": 0.14914129972457885,
+      "step": 3820
+    },
+    {
+      "epoch": 0.6959606986899564,
+      "grad_norm": 0.15709355473518372,
+      "learning_rate": 1.12188007778448e-05,
+      "loss": 0.14798580408096312,
+      "step": 3825
+    },
+    {
+      "epoch": 0.6968704512372634,
+      "grad_norm": 0.16631539165973663,
+      "learning_rate": 1.115744633217864e-05,
+      "loss": 0.14756966829299928,
+      "step": 3830
+    },
+    {
+      "epoch": 0.6977802037845706,
+      "grad_norm": 0.15893076360225677,
+      "learning_rate": 1.109621188804951e-05,
+      "loss": 0.14061959981918334,
+      "step": 3835
+    },
+    {
+      "epoch": 0.6986899563318777,
+      "grad_norm": 0.183414489030838,
+      "learning_rate": 1.103509797630077e-05,
+      "loss": 0.1448473334312439,
+      "step": 3840
+    },
+    {
+      "epoch": 0.6995997088791849,
+      "grad_norm": 0.14087305963039398,
+      "learning_rate": 1.0974105126730841e-05,
+      "loss": 0.14369285106658936,
+      "step": 3845
+    },
+    {
+      "epoch": 0.700509461426492,
+      "grad_norm": 0.16919967532157898,
+      "learning_rate": 1.0913233868088685e-05,
+      "loss": 0.1478085398674011,
+      "step": 3850
+    },
+    {
+      "epoch": 0.7014192139737991,
+      "grad_norm": 0.1439533829689026,
+      "learning_rate": 1.0852484728069178e-05,
+      "loss": 0.14376721382141114,
+      "step": 3855
+    },
+    {
+      "epoch": 0.7023289665211062,
+      "grad_norm": 0.17719274759292603,
+      "learning_rate": 1.0791858233308521e-05,
+      "loss": 0.14089040756225585,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7032387190684134,
+      "grad_norm": 0.19753769040107727,
+      "learning_rate": 1.0731354909379754e-05,
+      "loss": 0.15021742582321168,
+      "step": 3865
+    },
+    {
+      "epoch": 0.7041484716157205,
+      "grad_norm": 0.19186992943286896,
+      "learning_rate": 1.0670975280788086e-05,
+      "loss": 0.14113202095031738,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7050582241630277,
+      "grad_norm": 0.1709229201078415,
+      "learning_rate": 1.0610719870966443e-05,
+      "loss": 0.1500566840171814,
+      "step": 3875
+    },
+    {
+      "epoch": 0.7059679767103348,
+      "grad_norm": 0.17846204340457916,
+      "learning_rate": 1.0550589202270892e-05,
+      "loss": 0.15014195442199707,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7068777292576419,
+      "grad_norm": 0.1827082335948944,
+      "learning_rate": 1.0490583795976091e-05,
+      "loss": 0.1423472762107849,
+      "step": 3885
+    },
+    {
+      "epoch": 0.7077874818049491,
+      "grad_norm": 0.17418377101421356,
+      "learning_rate": 1.043070417227083e-05,
+      "loss": 0.14668900966644288,
+      "step": 3890
+    },
+    {
+      "epoch": 0.7086972343522562,
+      "grad_norm": 0.17385616898536682,
+      "learning_rate": 1.0370950850253449e-05,
+      "loss": 0.14627279043197633,
+      "step": 3895
+    },
+    {
+      "epoch": 0.7096069868995634,
+      "grad_norm": 0.16486723721027374,
+      "learning_rate": 1.0311324347927404e-05,
+      "loss": 0.14603652954101562,
+      "step": 3900
+    },
+    {
+      "epoch": 0.7105167394468704,
+      "grad_norm": 0.21806862950325012,
+      "learning_rate": 1.0251825182196732e-05,
+      "loss": 0.1488169550895691,
+      "step": 3905
+    },
+    {
+      "epoch": 0.7114264919941776,
+      "grad_norm": 0.19884569942951202,
+      "learning_rate": 1.019245386886159e-05,
+      "loss": 0.14387656450271608,
+      "step": 3910
+    },
+    {
+      "epoch": 0.7123362445414847,
+      "grad_norm": 0.16139011085033417,
+      "learning_rate": 1.0133210922613789e-05,
+      "loss": 0.1483074426651001,
+      "step": 3915
+    },
+    {
+      "epoch": 0.7132459970887919,
+      "grad_norm": 0.17000740766525269,
+      "learning_rate": 1.007409685703229e-05,
+      "loss": 0.14050065279006957,
+      "step": 3920
+    },
+    {
+      "epoch": 0.714155749636099,
+      "grad_norm": 0.17235304415225983,
+      "learning_rate": 1.0015112184578813e-05,
+      "loss": 0.1440442681312561,
+      "step": 3925
+    },
+    {
+      "epoch": 0.7150655021834061,
+      "grad_norm": 0.15737567842006683,
+      "learning_rate": 9.956257416593362e-06,
+      "loss": 0.14960765838623047,
+      "step": 3930
+    },
+    {
+      "epoch": 0.7159752547307132,
+      "grad_norm": 0.15499180555343628,
+      "learning_rate": 9.897533063289773e-06,
+      "loss": 0.14488829374313356,
+      "step": 3935
+    },
+    {
+      "epoch": 0.7168850072780204,
+      "grad_norm": 0.17744216322898865,
+      "learning_rate": 9.838939633751337e-06,
+      "loss": 0.1416949987411499,
+      "step": 3940
+    },
+    {
+      "epoch": 0.7177947598253275,
+      "grad_norm": 0.1597192883491516,
+      "learning_rate": 9.780477635926358e-06,
+      "loss": 0.14275280237197877,
+      "step": 3945
+    },
+    {
+      "epoch": 0.7187045123726347,
+      "grad_norm": 0.17800374329090118,
+      "learning_rate": 9.722147576623743e-06,
+      "loss": 0.14532098770141602,
+      "step": 3950
+    },
+    {
+      "epoch": 0.7196142649199417,
+      "grad_norm": 0.1828162521123886,
+      "learning_rate": 9.66394996150864e-06,
+      "loss": 0.14525585174560546,
+      "step": 3955
+    },
+    {
+      "epoch": 0.7205240174672489,
+      "grad_norm": 0.1800539344549179,
+      "learning_rate": 9.605885295098005e-06,
+      "loss": 0.14235819578170777,
+      "step": 3960
+    },
+    {
+      "epoch": 0.721433770014556,
+      "grad_norm": 0.16556483507156372,
+      "learning_rate": 9.54795408075628e-06,
+      "loss": 0.13965482711791993,
+      "step": 3965
+    },
+    {
+      "epoch": 0.7223435225618632,
+      "grad_norm": 0.1592024862766266,
+      "learning_rate": 9.49015682069101e-06,
+      "loss": 0.14051042795181273,
+      "step": 3970
+    },
+    {
+      "epoch": 0.7232532751091703,
+      "grad_norm": 0.18988847732543945,
+      "learning_rate": 9.43249401594846e-06,
+      "loss": 0.1436900496482849,
+      "step": 3975
+    },
+    {
+      "epoch": 0.7241630276564774,
+      "grad_norm": 0.24433808028697968,
+      "learning_rate": 9.374966166409329e-06,
+      "loss": 0.14883997440338134,
+      "step": 3980
+    },
+    {
+      "epoch": 0.7250727802037845,
+      "grad_norm": 0.15091639757156372,
+      "learning_rate": 9.317573770784352e-06,
+      "loss": 0.14726560115814208,
+      "step": 3985
+    },
+    {
+      "epoch": 0.7259825327510917,
+      "grad_norm": 0.17045573890209198,
+      "learning_rate": 9.260317326610051e-06,
+      "loss": 0.14120506048202514,
+      "step": 3990
+    },
+    {
+      "epoch": 0.7268922852983989,
+      "grad_norm": 0.18847957253456116,
+      "learning_rate": 9.203197330244343e-06,
+      "loss": 0.1377041220664978,
+      "step": 3995
+    },
+    {
+      "epoch": 0.727802037845706,
+      "grad_norm": 0.1516445279121399,
+      "learning_rate": 9.14621427686229e-06,
+      "loss": 0.14043946266174318,
+      "step": 4000
+    },
+    {
+      "epoch": 0.7287117903930131,
+      "grad_norm": 0.18264050781726837,
+      "learning_rate": 9.0893686604518e-06,
+      "loss": 0.14080368280410765,
+      "step": 4005
+    },
+    {
+      "epoch": 0.7296215429403202,
+      "grad_norm": 0.19129371643066406,
+      "learning_rate": 9.032660973809312e-06,
+      "loss": 0.1402561902999878,
+      "step": 4010
+    },
+    {
+      "epoch": 0.7305312954876274,
+      "grad_norm": 0.15762710571289062,
+      "learning_rate": 8.976091708535567e-06,
+      "loss": 0.14421157836914061,
+      "step": 4015
+    },
+    {
+      "epoch": 0.7314410480349345,
+      "grad_norm": 0.17785198986530304,
+      "learning_rate": 8.919661355031331e-06,
+      "loss": 0.14999009370803834,
+      "step": 4020
+    },
+    {
+      "epoch": 0.7323508005822417,
+      "grad_norm": 0.15306031703948975,
+      "learning_rate": 8.8633704024931e-06,
+      "loss": 0.14101698398590087,
+      "step": 4025
+    },
+    {
+      "epoch": 0.7332605531295487,
+      "grad_norm": 0.16481758654117584,
+      "learning_rate": 8.807219338908968e-06,
+      "loss": 0.14170764684677123,
+      "step": 4030
+    },
+    {
+      "epoch": 0.7341703056768559,
+      "grad_norm": 0.14892235398292542,
+      "learning_rate": 8.751208651054257e-06,
+      "loss": 0.15317896604537964,
+      "step": 4035
+    },
+    {
+      "epoch": 0.735080058224163,
+      "grad_norm": 0.1775592565536499,
+      "learning_rate": 8.695338824487409e-06,
+      "loss": 0.1520617723464966,
+      "step": 4040
+    },
+    {
+      "epoch": 0.7359898107714702,
+      "grad_norm": 0.1614258885383606,
+      "learning_rate": 8.639610343545728e-06,
+      "loss": 0.13747400045394897,
+      "step": 4045
+    },
+    {
+      "epoch": 0.7368995633187773,
+      "grad_norm": 0.21415506303310394,
+      "learning_rate": 8.58402369134117e-06,
+      "loss": 0.1432439088821411,
+      "step": 4050
+    },
+    {
+      "epoch": 0.7378093158660844,
+      "grad_norm": 0.1759418249130249,
+      "learning_rate": 8.528579349756205e-06,
+      "loss": 0.141641104221344,
+      "step": 4055
+    },
+    {
+      "epoch": 0.7387190684133915,
+      "grad_norm": 0.16738329827785492,
+      "learning_rate": 8.47327779943957e-06,
+      "loss": 0.14294810295104982,
+      "step": 4060
+    },
+    {
+      "epoch": 0.7396288209606987,
+      "grad_norm": 0.13916844129562378,
+      "learning_rate": 8.41811951980217e-06,
+      "loss": 0.13876968622207642,
+      "step": 4065
+    },
+    {
+      "epoch": 0.7405385735080058,
+      "grad_norm": 0.1828441321849823,
+      "learning_rate": 8.36310498901288e-06,
+      "loss": 0.148428475856781,
+      "step": 4070
+    },
+    {
+      "epoch": 0.741448326055313,
+      "grad_norm": 0.16534076631069183,
+      "learning_rate": 8.308234683994415e-06,
+      "loss": 0.14222711324691772,
+      "step": 4075
+    },
+    {
+      "epoch": 0.74235807860262,
+      "grad_norm": 0.17922644317150116,
+      "learning_rate": 8.253509080419198e-06,
+      "loss": 0.14365782737731933,
+      "step": 4080
+    },
+    {
+      "epoch": 0.7432678311499272,
+      "grad_norm": 0.15061035752296448,
+      "learning_rate": 8.198928652705204e-06,
+      "loss": 0.13571925163269044,
+      "step": 4085
+    },
+    {
+      "epoch": 0.7441775836972343,
+      "grad_norm": 0.18075402081012726,
+      "learning_rate": 8.144493874011908e-06,
+      "loss": 0.14385528564453126,
+      "step": 4090
+    },
+    {
+      "epoch": 0.7450873362445415,
+      "grad_norm": 0.16514739394187927,
+      "learning_rate": 8.090205216236135e-06,
+      "loss": 0.14920626878738402,
+      "step": 4095
+    },
+    {
+      "epoch": 0.7459970887918487,
+      "grad_norm": 0.16453702747821808,
+      "learning_rate": 8.03606315000797e-06,
+      "loss": 0.14704222679138185,
+      "step": 4100
+    },
+    {
+      "epoch": 0.7469068413391557,
+      "grad_norm": 0.16719917953014374,
+      "learning_rate": 7.982068144686707e-06,
+      "loss": 0.14722511768341065,
+      "step": 4105
+    },
+    {
+      "epoch": 0.7478165938864629,
+      "grad_norm": 0.18499110639095306,
+      "learning_rate": 7.92822066835677e-06,
+      "loss": 0.1401848554611206,
+      "step": 4110
+    },
+    {
+      "epoch": 0.74872634643377,
+      "grad_norm": 0.17249563336372375,
+      "learning_rate": 7.87452118782363e-06,
+      "loss": 0.15132423639297485,
+      "step": 4115
+    },
+    {
+      "epoch": 0.7496360989810772,
+      "grad_norm": 0.15049682557582855,
+      "learning_rate": 7.8209701686098e-06,
+      "loss": 0.1341150164604187,
+      "step": 4120
+    },
+    {
+      "epoch": 0.7505458515283843,
+      "grad_norm": 0.16892646253108978,
+      "learning_rate": 7.767568074950751e-06,
+      "loss": 0.1466840147972107,
+      "step": 4125
+    },
+    {
+      "epoch": 0.7514556040756915,
+      "grad_norm": 0.17288286983966827,
+      "learning_rate": 7.714315369790942e-06,
+      "loss": 0.13819680213928223,
+      "step": 4130
+    },
+    {
+      "epoch": 0.7523653566229985,
+      "grad_norm": 0.21893996000289917,
+      "learning_rate": 7.661212514779745e-06,
+      "loss": 0.14369510412216185,
+      "step": 4135
+    },
+    {
+      "epoch": 0.7532751091703057,
+      "grad_norm": 0.1674601435661316,
+      "learning_rate": 7.608259970267509e-06,
+      "loss": 0.14810250997543334,
+      "step": 4140
+    },
+    {
+      "epoch": 0.7541848617176128,
+      "grad_norm": 0.15875539183616638,
+      "learning_rate": 7.555458195301526e-06,
+      "loss": 0.14103198051452637,
+      "step": 4145
+    },
+    {
+      "epoch": 0.75509461426492,
+      "grad_norm": 0.19454079866409302,
+      "learning_rate": 7.502807647622037e-06,
+      "loss": 0.13848764896392823,
+      "step": 4150
+    },
+    {
+      "epoch": 0.756004366812227,
+      "grad_norm": 0.1795455813407898,
+      "learning_rate": 7.450308783658341e-06,
+      "loss": 0.14459335803985596,
+      "step": 4155
+    },
+    {
+      "epoch": 0.7569141193595342,
+      "grad_norm": 0.1643362045288086,
+      "learning_rate": 7.397962058524735e-06,
+      "loss": 0.14335378408432006,
+      "step": 4160
+    },
+    {
+      "epoch": 0.7578238719068413,
+      "grad_norm": 0.16362066566944122,
+      "learning_rate": 7.3457679260166475e-06,
+      "loss": 0.14222005605697632,
+      "step": 4165
+    },
+    {
+      "epoch": 0.7587336244541485,
+      "grad_norm": 0.17313003540039062,
+      "learning_rate": 7.293726838606674e-06,
+      "loss": 0.14272255897521974,
+      "step": 4170
+    },
+    {
+      "epoch": 0.7596433770014556,
+      "grad_norm": 0.1809929460287094,
+      "learning_rate": 7.2418392474406405e-06,
+      "loss": 0.14089123010635377,
+      "step": 4175
+    },
+    {
+      "epoch": 0.7605531295487628,
+      "grad_norm": 0.14306005835533142,
+      "learning_rate": 7.19010560233373e-06,
+      "loss": 0.13531534671783446,
+      "step": 4180
+    },
+    {
+      "epoch": 0.7614628820960698,
+      "grad_norm": 0.15525390207767487,
+      "learning_rate": 7.138526351766559e-06,
+      "loss": 0.14340845346450806,
+      "step": 4185
+    },
+    {
+      "epoch": 0.762372634643377,
+      "grad_norm": 0.24478943645954132,
+      "learning_rate": 7.087101942881263e-06,
+      "loss": 0.14744555950164795,
+      "step": 4190
+    },
+    {
+      "epoch": 0.7632823871906841,
+      "grad_norm": 0.31335577368736267,
+      "learning_rate": 7.035832821477711e-06,
+      "loss": 0.1484094500541687,
+      "step": 4195
+    },
+    {
+      "epoch": 0.7641921397379913,
+      "grad_norm": 0.15140366554260254,
+      "learning_rate": 6.984719432009515e-06,
+      "loss": 0.14991614818572999,
+      "step": 4200
+    },
+    {
+      "epoch": 0.7651018922852983,
+      "grad_norm": 0.16125506162643433,
+      "learning_rate": 6.933762217580289e-06,
+      "loss": 0.1408134937286377,
+      "step": 4205
+    },
+    {
+      "epoch": 0.7660116448326055,
+      "grad_norm": 0.2501450181007385,
+      "learning_rate": 6.882961619939726e-06,
+      "loss": 0.13875640630722047,
+      "step": 4210
+    },
+    {
+      "epoch": 0.7669213973799127,
+      "grad_norm": 0.16227811574935913,
+      "learning_rate": 6.8323180794798245e-06,
+      "loss": 0.14138660430908204,
+      "step": 4215
+    },
+    {
+      "epoch": 0.7678311499272198,
+      "grad_norm": 0.16676810383796692,
+      "learning_rate": 6.781832035231053e-06,
+      "loss": 0.14696706533432008,
+      "step": 4220
+    },
+    {
+      "epoch": 0.768740902474527,
+      "grad_norm": 0.14638574421405792,
+      "learning_rate": 6.731503924858518e-06,
+      "loss": 0.14263020753860473,
+      "step": 4225
+    },
+    {
+      "epoch": 0.769650655021834,
+      "grad_norm": 0.17093190550804138,
+      "learning_rate": 6.681334184658211e-06,
+      "loss": 0.14694111347198485,
+      "step": 4230
+    },
+    {
+      "epoch": 0.7705604075691412,
+      "grad_norm": 0.17174287140369415,
+      "learning_rate": 6.631323249553201e-06,
+      "loss": 0.13854929208755493,
+      "step": 4235
+    },
+    {
+      "epoch": 0.7714701601164483,
+      "grad_norm": 0.14599016308784485,
+      "learning_rate": 6.5814715530898745e-06,
+      "loss": 0.14058833122253417,
+      "step": 4240
+    },
+    {
+      "epoch": 0.7723799126637555,
+      "grad_norm": 0.16222265362739563,
+      "learning_rate": 6.531779527434176e-06,
+      "loss": 0.1428326725959778,
+      "step": 4245
+    },
+    {
+      "epoch": 0.7732896652110626,
+      "grad_norm": 0.1741994023323059,
+      "learning_rate": 6.482247603367839e-06,
+      "loss": 0.13985042572021483,
+      "step": 4250
+    },
+    {
+      "epoch": 0.7741994177583698,
+      "grad_norm": 0.17427101731300354,
+      "learning_rate": 6.432876210284688e-06,
+      "loss": 0.1442667603492737,
+      "step": 4255
+    },
+    {
+      "epoch": 0.7751091703056768,
+      "grad_norm": 0.1665259599685669,
+      "learning_rate": 6.383665776186912e-06,
+      "loss": 0.1421986222267151,
+      "step": 4260
+    },
+    {
+      "epoch": 0.776018922852984,
+      "grad_norm": 0.1728232353925705,
+      "learning_rate": 6.334616727681303e-06,
+      "loss": 0.1367053508758545,
+      "step": 4265
+    },
+    {
+      "epoch": 0.7769286754002911,
+      "grad_norm": 0.15882381796836853,
+      "learning_rate": 6.285729489975639e-06,
+      "loss": 0.14551182985305786,
+      "step": 4270
+    },
+    {
+      "epoch": 0.7778384279475983,
+      "grad_norm": 0.242042675614357,
+      "learning_rate": 6.2370044868749115e-06,
+      "loss": 0.1455132007598877,
+      "step": 4275
+    },
+    {
+      "epoch": 0.7787481804949054,
+      "grad_norm": 0.1599501073360443,
+      "learning_rate": 6.188442140777742e-06,
+      "loss": 0.1424942970275879,
+      "step": 4280
+    },
+    {
+      "epoch": 0.7796579330422125,
+      "grad_norm": 0.15182635188102722,
+      "learning_rate": 6.140042872672647e-06,
+      "loss": 0.14212887287139891,
+      "step": 4285
+    },
+    {
+      "epoch": 0.7805676855895196,
+      "grad_norm": 0.1720375418663025,
+      "learning_rate": 6.091807102134403e-06,
+      "loss": 0.14243412017822266,
+      "step": 4290
+    },
+    {
+      "epoch": 0.7814774381368268,
+      "grad_norm": 0.16436047852039337,
+      "learning_rate": 6.043735247320454e-06,
+      "loss": 0.15035657882690429,
+      "step": 4295
+    },
+    {
+      "epoch": 0.7823871906841339,
+      "grad_norm": 0.1498408019542694,
+      "learning_rate": 5.995827724967218e-06,
+      "loss": 0.14494839906692505,
+      "step": 4300
+    },
+    {
+      "epoch": 0.7832969432314411,
+      "grad_norm": 0.16924560070037842,
+      "learning_rate": 5.948084950386535e-06,
+      "loss": 0.13581212759017944,
+      "step": 4305
+    },
+    {
+      "epoch": 0.7842066957787481,
+      "grad_norm": 0.15889139473438263,
+      "learning_rate": 5.900507337462036e-06,
+      "loss": 0.15071530342102052,
+      "step": 4310
+    },
+    {
+      "epoch": 0.7851164483260553,
+      "grad_norm": 0.17201054096221924,
+      "learning_rate": 5.853095298645542e-06,
+      "loss": 0.1398628830909729,
+      "step": 4315
+    },
+    {
+      "epoch": 0.7860262008733624,
+      "grad_norm": 0.17965619266033173,
+      "learning_rate": 5.805849244953548e-06,
+      "loss": 0.14666696786880493,
+      "step": 4320
+    },
+    {
+      "epoch": 0.7869359534206696,
+      "grad_norm": 0.17514032125473022,
+      "learning_rate": 5.758769585963569e-06,
+      "loss": 0.1383386731147766,
+      "step": 4325
+    },
+    {
+      "epoch": 0.7878457059679768,
+      "grad_norm": 0.17497631907463074,
+      "learning_rate": 5.7118567298106744e-06,
+      "loss": 0.14362354278564454,
+      "step": 4330
+    },
+    {
+      "epoch": 0.7887554585152838,
+      "grad_norm": 0.16770458221435547,
+      "learning_rate": 5.665111083183905e-06,
+      "loss": 0.14136618375778198,
+      "step": 4335
+    },
+    {
+      "epoch": 0.789665211062591,
+      "grad_norm": 0.17134106159210205,
+      "learning_rate": 5.618533051322747e-06,
+      "loss": 0.1401529550552368,
+      "step": 4340
+    },
+    {
+      "epoch": 0.7905749636098981,
+      "grad_norm": 0.19458788633346558,
+      "learning_rate": 5.5721230380136435e-06,
+      "loss": 0.1393273115158081,
+      "step": 4345
+    },
+    {
+      "epoch": 0.7914847161572053,
+      "grad_norm": 0.19483692944049835,
+      "learning_rate": 5.525881445586467e-06,
+      "loss": 0.1369825482368469,
+      "step": 4350
+    },
+    {
+      "epoch": 0.7923944687045124,
+      "grad_norm": 0.3052191734313965,
+      "learning_rate": 5.4798086749110495e-06,
+      "loss": 0.14762181043624878,
+      "step": 4355
+    },
+    {
+      "epoch": 0.7933042212518195,
+      "grad_norm": 0.164458766579628,
+      "learning_rate": 5.4339051253937065e-06,
+      "loss": 0.14501686096191407,
+      "step": 4360
+    },
+    {
+      "epoch": 0.7942139737991266,
+      "grad_norm": 0.1719193458557129,
+      "learning_rate": 5.3881711949737625e-06,
+      "loss": 0.13321092128753662,
+      "step": 4365
+    },
+    {
+      "epoch": 0.7951237263464338,
+      "grad_norm": 0.17219696938991547,
+      "learning_rate": 5.342607280120121e-06,
+      "loss": 0.1413906455039978,
+      "step": 4370
+    },
+    {
+      "epoch": 0.7960334788937409,
+      "grad_norm": 0.15083056688308716,
+      "learning_rate": 5.297213775827789e-06,
+      "loss": 0.14772192239761353,
+      "step": 4375
+    },
+    {
+      "epoch": 0.7969432314410481,
+      "grad_norm": 0.1699071079492569,
+      "learning_rate": 5.251991075614507e-06,
+      "loss": 0.1392375946044922,
+      "step": 4380
+    },
+    {
+      "epoch": 0.7978529839883551,
+      "grad_norm": 0.1680395007133484,
+      "learning_rate": 5.206939571517302e-06,
+      "loss": 0.14185575246810914,
+      "step": 4385
+    },
+    {
+      "epoch": 0.7987627365356623,
+      "grad_norm": 0.16526710987091064,
+      "learning_rate": 5.162059654089083e-06,
+      "loss": 0.15001428127288818,
+      "step": 4390
+    },
+    {
+      "epoch": 0.7996724890829694,
+      "grad_norm": 0.16281752288341522,
+      "learning_rate": 5.1173517123952794e-06,
+      "loss": 0.13747023344039916,
+      "step": 4395
+    },
+    {
+      "epoch": 0.8005822416302766,
+      "grad_norm": 0.1454378366470337,
+      "learning_rate": 5.072816134010458e-06,
+      "loss": 0.14710829257965088,
+      "step": 4400
+    },
+    {
+      "epoch": 0.8014919941775837,
+      "grad_norm": 0.16565890610218048,
+      "learning_rate": 5.028453305014966e-06,
+      "loss": 0.14138611555099487,
+      "step": 4405
+    },
+    {
+      "epoch": 0.8024017467248908,
+      "grad_norm": 0.1962810605764389,
+      "learning_rate": 4.984263609991577e-06,
+      "loss": 0.13836177587509155,
+      "step": 4410
+    },
+    {
+      "epoch": 0.8033114992721979,
+      "grad_norm": 0.16091369092464447,
+      "learning_rate": 4.940247432022149e-06,
+      "loss": 0.14407440423965454,
+      "step": 4415
+    },
+    {
+      "epoch": 0.8042212518195051,
+      "grad_norm": 0.1930241584777832,
+      "learning_rate": 4.89640515268433e-06,
+      "loss": 0.14346336126327514,
+      "step": 4420
+    },
+    {
+      "epoch": 0.8051310043668122,
+      "grad_norm": 0.19301500916481018,
+      "learning_rate": 4.852737152048242e-06,
+      "loss": 0.14174317121505736,
+      "step": 4425
+    },
+    {
+      "epoch": 0.8060407569141194,
+      "grad_norm": 0.1541353315114975,
+      "learning_rate": 4.80924380867315e-06,
+      "loss": 0.14100592136383056,
+      "step": 4430
+    },
+    {
+      "epoch": 0.8069505094614265,
+      "grad_norm": 0.16285750269889832,
+      "learning_rate": 4.765925499604243e-06,
+      "loss": 0.1441288709640503,
+      "step": 4435
+    },
+    {
+      "epoch": 0.8078602620087336,
+      "grad_norm": 0.17382675409317017,
+      "learning_rate": 4.722782600369299e-06,
+      "loss": 0.13763951063156127,
+      "step": 4440
+    },
+    {
+      "epoch": 0.8087700145560408,
+      "grad_norm": 0.1697344034910202,
+      "learning_rate": 4.679815484975505e-06,
+      "loss": 0.1410105347633362,
+      "step": 4445
+    },
+    {
+      "epoch": 0.8096797671033479,
+      "grad_norm": 0.19964542984962463,
+      "learning_rate": 4.637024525906131e-06,
+      "loss": 0.1439276695251465,
+      "step": 4450
+    },
+    {
+      "epoch": 0.8105895196506551,
+      "grad_norm": 0.165307879447937,
+      "learning_rate": 4.59441009411736e-06,
+      "loss": 0.13897504806518554,
+      "step": 4455
+    },
+    {
+      "epoch": 0.8114992721979621,
+      "grad_norm": 0.16687989234924316,
+      "learning_rate": 4.551972559035067e-06,
+      "loss": 0.1422593355178833,
+      "step": 4460
+    },
+    {
+      "epoch": 0.8124090247452693,
+      "grad_norm": 0.15737789869308472,
+      "learning_rate": 4.509712288551571e-06,
+      "loss": 0.1452128052711487,
+      "step": 4465
+    },
+    {
+      "epoch": 0.8133187772925764,
+      "grad_norm": 0.17116659879684448,
+      "learning_rate": 4.467629649022509e-06,
+      "loss": 0.14385371208190917,
+      "step": 4470
+    },
+    {
+      "epoch": 0.8142285298398836,
+      "grad_norm": 0.17457640171051025,
+      "learning_rate": 4.425725005263623e-06,
+      "loss": 0.14808475971221924,
+      "step": 4475
+    },
+    {
+      "epoch": 0.8151382823871907,
+      "grad_norm": 0.1621970385313034,
+      "learning_rate": 4.383998720547583e-06,
+      "loss": 0.13927959203720092,
+      "step": 4480
+    },
+    {
+      "epoch": 0.8160480349344978,
+      "grad_norm": 0.176296666264534,
+      "learning_rate": 4.342451156600896e-06,
+      "loss": 0.15041060447692872,
+      "step": 4485
+    },
+    {
+      "epoch": 0.8169577874818049,
+      "grad_norm": 0.17157645523548126,
+      "learning_rate": 4.301082673600698e-06,
+      "loss": 0.13932652473449708,
+      "step": 4490
+    },
+    {
+      "epoch": 0.8178675400291121,
+      "grad_norm": 0.15378527343273163,
+      "learning_rate": 4.259893630171682e-06,
+      "loss": 0.1406856894493103,
+      "step": 4495
+    },
+    {
+      "epoch": 0.8187772925764192,
+      "grad_norm": 0.1750226765871048,
+      "learning_rate": 4.218884383382987e-06,
+      "loss": 0.1350164532661438,
+      "step": 4500
+    },
+    {
+      "epoch": 0.8196870451237264,
+      "grad_norm": 0.1393742561340332,
+      "learning_rate": 4.178055288745053e-06,
+      "loss": 0.13769235610961914,
+      "step": 4505
+    },
+    {
+      "epoch": 0.8205967976710334,
+      "grad_norm": 0.1668994128704071,
+      "learning_rate": 4.137406700206617e-06,
+      "loss": 0.14029752016067504,
+      "step": 4510
+    },
+    {
+      "epoch": 0.8215065502183406,
+      "grad_norm": 0.1833454668521881,
+      "learning_rate": 4.0969389701515675e-06,
+      "loss": 0.14276301860809326,
+      "step": 4515
+    },
+    {
+      "epoch": 0.8224163027656477,
+      "grad_norm": 0.16187874972820282,
+      "learning_rate": 4.056652449395945e-06,
+      "loss": 0.1444832682609558,
+      "step": 4520
+    },
+    {
+      "epoch": 0.8233260553129549,
+      "grad_norm": 0.1453280746936798,
+      "learning_rate": 4.01654748718488e-06,
+      "loss": 0.14512733221054078,
+      "step": 4525
+    },
+    {
+      "epoch": 0.824235807860262,
+      "grad_norm": 0.1782725751399994,
+      "learning_rate": 3.976624431189563e-06,
+      "loss": 0.14093561172485353,
+      "step": 4530
+    },
+    {
+      "epoch": 0.8251455604075691,
+      "grad_norm": 0.17374491691589355,
+      "learning_rate": 3.936883627504234e-06,
+      "loss": 0.14031401872634888,
+      "step": 4535
+    },
+    {
+      "epoch": 0.8260553129548762,
+      "grad_norm": 0.1609172821044922,
+      "learning_rate": 3.897325420643174e-06,
+      "loss": 0.1428336262702942,
+      "step": 4540
+    },
+    {
+      "epoch": 0.8269650655021834,
+      "grad_norm": 0.1520884931087494,
+      "learning_rate": 3.85795015353774e-06,
+      "loss": 0.1460547924041748,
+      "step": 4545
+    },
+    {
+      "epoch": 0.8278748180494906,
+      "grad_norm": 0.20986326038837433,
+      "learning_rate": 3.818758167533376e-06,
+      "loss": 0.14706350564956666,
+      "step": 4550
+    },
+    {
+      "epoch": 0.8287845705967977,
+      "grad_norm": 0.16825413703918457,
+      "learning_rate": 3.7797498023866396e-06,
+      "loss": 0.14507200717926025,
+      "step": 4555
+    },
+    {
+      "epoch": 0.8296943231441049,
+      "grad_norm": 0.16758380830287933,
+      "learning_rate": 3.740925396262296e-06,
+      "loss": 0.14898381233215333,
+      "step": 4560
+    },
+    {
+      "epoch": 0.8306040756914119,
+      "grad_norm": 0.15207453072071075,
+      "learning_rate": 3.7022852857303503e-06,
+      "loss": 0.14138854742050172,
+      "step": 4565
+    },
+    {
+      "epoch": 0.8315138282387191,
+      "grad_norm": 0.15150749683380127,
+      "learning_rate": 3.66382980576315e-06,
+      "loss": 0.13894975185394287,
+      "step": 4570
+    },
+    {
+      "epoch": 0.8324235807860262,
+      "grad_norm": 0.17071188986301422,
+      "learning_rate": 3.625559289732472e-06,
+      "loss": 0.14072470664978026,
+      "step": 4575
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.154335618019104,
+      "learning_rate": 3.5874740694066294e-06,
+      "loss": 0.13791344165802003,
+      "step": 4580
+    },
+    {
+      "epoch": 0.8342430858806404,
+      "grad_norm": 0.14017128944396973,
+      "learning_rate": 3.5495744749476116e-06,
+      "loss": 0.14427922964096068,
+      "step": 4585
+    },
+    {
+      "epoch": 0.8351528384279476,
+      "grad_norm": 0.17210033535957336,
+      "learning_rate": 3.5118608349081983e-06,
+      "loss": 0.15191166400909423,
+      "step": 4590
+    },
+    {
+      "epoch": 0.8360625909752547,
+      "grad_norm": 0.18715685606002808,
+      "learning_rate": 3.4743334762291358e-06,
+      "loss": 0.14451316595077515,
+      "step": 4595
+    },
+    {
+      "epoch": 0.8369723435225619,
+      "grad_norm": 0.18079884350299835,
+      "learning_rate": 3.436992724236293e-06,
+      "loss": 0.13530746698379517,
+      "step": 4600
+    },
+    {
+      "epoch": 0.837882096069869,
+      "grad_norm": 0.13519920408725739,
+      "learning_rate": 3.399838902637817e-06,
+      "loss": 0.1477964401245117,
+      "step": 4605
+    },
+    {
+      "epoch": 0.8387918486171762,
+      "grad_norm": 0.1778026670217514,
+      "learning_rate": 3.3628723335213885e-06,
+      "loss": 0.14419831037521363,
+      "step": 4610
+    },
+    {
+      "epoch": 0.8397016011644832,
+      "grad_norm": 0.15165366232395172,
+      "learning_rate": 3.326093337351355e-06,
+      "loss": 0.13888469934463502,
+      "step": 4615
+    },
+    {
+      "epoch": 0.8406113537117904,
+      "grad_norm": 0.17049473524093628,
+      "learning_rate": 3.2895022329660018e-06,
+      "loss": 0.14438477754592896,
+      "step": 4620
+    },
+    {
+      "epoch": 0.8415211062590975,
+      "grad_norm": 0.16536414623260498,
+      "learning_rate": 3.2530993375747833e-06,
+      "loss": 0.1444351315498352,
+      "step": 4625
+    },
+    {
+      "epoch": 0.8424308588064047,
+      "grad_norm": 0.17570015788078308,
+      "learning_rate": 3.2168849667555402e-06,
+      "loss": 0.13861945867538453,
+      "step": 4630
+    },
+    {
+      "epoch": 0.8433406113537117,
+      "grad_norm": 0.1699545532464981,
+      "learning_rate": 3.1808594344518132e-06,
+      "loss": 0.13902754783630372,
+      "step": 4635
+    },
+    {
+      "epoch": 0.8442503639010189,
+      "grad_norm": 0.12331254780292511,
+      "learning_rate": 3.1450230529700837e-06,
+      "loss": 0.14104254245758058,
+      "step": 4640
+    },
+    {
+      "epoch": 0.845160116448326,
+      "grad_norm": 0.1508190929889679,
+      "learning_rate": 3.1093761329770708e-06,
+      "loss": 0.13288766145706177,
+      "step": 4645
+    },
+    {
+      "epoch": 0.8460698689956332,
+      "grad_norm": 0.19049489498138428,
+      "learning_rate": 3.0739189834970735e-06,
+      "loss": 0.14914840459823608,
+      "step": 4650
+    },
+    {
+      "epoch": 0.8469796215429404,
+      "grad_norm": 0.1662369966506958,
+      "learning_rate": 3.0386519119092293e-06,
+      "loss": 0.14222898483276367,
+      "step": 4655
+    },
+    {
+      "epoch": 0.8478893740902474,
+      "grad_norm": 0.18985967338085175,
+      "learning_rate": 3.0035752239449126e-06,
+      "loss": 0.14431113004684448,
+      "step": 4660
+    },
+    {
+      "epoch": 0.8487991266375546,
+      "grad_norm": 0.17005261778831482,
+      "learning_rate": 2.9686892236850337e-06,
+      "loss": 0.14140807390213012,
+      "step": 4665
+    },
+    {
+      "epoch": 0.8497088791848617,
+      "grad_norm": 0.16786684095859528,
+      "learning_rate": 2.9339942135574394e-06,
+      "loss": 0.14161460399627684,
+      "step": 4670
+    },
+    {
+      "epoch": 0.8506186317321689,
+      "grad_norm": 0.16358181834220886,
+      "learning_rate": 2.899490494334281e-06,
+      "loss": 0.14674670696258546,
+      "step": 4675
+    },
+    {
+      "epoch": 0.851528384279476,
+      "grad_norm": 0.1651349812746048,
+      "learning_rate": 2.8651783651293867e-06,
+      "loss": 0.13794611692428588,
+      "step": 4680
+    },
+    {
+      "epoch": 0.8524381368267832,
+      "grad_norm": 0.16934923827648163,
+      "learning_rate": 2.831058123395694e-06,
+      "loss": 0.13199397325515747,
+      "step": 4685
+    },
+    {
+      "epoch": 0.8533478893740902,
+      "grad_norm": 0.1704150140285492,
+      "learning_rate": 2.797130064922665e-06,
+      "loss": 0.14044904708862305,
+      "step": 4690
+    },
+    {
+      "epoch": 0.8542576419213974,
+      "grad_norm": 0.1814192682504654,
+      "learning_rate": 2.7633944838337143e-06,
+      "loss": 0.1465100646018982,
+      "step": 4695
+    },
+    {
+      "epoch": 0.8551673944687045,
+      "grad_norm": 0.18942610919475555,
+      "learning_rate": 2.729851672583669e-06,
+      "loss": 0.14685982465744019,
+      "step": 4700
+    },
+    {
+      "epoch": 0.8560771470160117,
+      "grad_norm": 0.17895208299160004,
+      "learning_rate": 2.6965019219562155e-06,
+      "loss": 0.13971571922302245,
+      "step": 4705
+    },
+    {
+      "epoch": 0.8569868995633187,
+      "grad_norm": 0.22735828161239624,
+      "learning_rate": 2.6633455210614055e-06,
+      "loss": 0.13776102066040039,
+      "step": 4710
+    },
+    {
+      "epoch": 0.8578966521106259,
+      "grad_norm": 0.16779793798923492,
+      "learning_rate": 2.630382757333133e-06,
+      "loss": 0.14134042263031005,
+      "step": 4715
+    },
+    {
+      "epoch": 0.858806404657933,
+      "grad_norm": 0.2148888260126114,
+      "learning_rate": 2.597613916526637e-06,
+      "loss": 0.14680721759796142,
+      "step": 4720
+    },
+    {
+      "epoch": 0.8597161572052402,
+      "grad_norm": 0.16560257971286774,
+      "learning_rate": 2.565039282716045e-06,
+      "loss": 0.14137234687805175,
+      "step": 4725
+    },
+    {
+      "epoch": 0.8606259097525473,
+      "grad_norm": 0.16197068989276886,
+      "learning_rate": 2.532659138291879e-06,
+      "loss": 0.14969314336776735,
+      "step": 4730
+    },
+    {
+      "epoch": 0.8615356622998545,
+      "grad_norm": 0.14650246500968933,
+      "learning_rate": 2.5004737639586497e-06,
+      "loss": 0.13532910346984864,
+      "step": 4735
+    },
+    {
+      "epoch": 0.8624454148471615,
+      "grad_norm": 0.1565634310245514,
+      "learning_rate": 2.4684834387323943e-06,
+      "loss": 0.14146244525909424,
+      "step": 4740
+    },
+    {
+      "epoch": 0.8633551673944687,
+      "grad_norm": 0.18060864508152008,
+      "learning_rate": 2.4366884399382393e-06,
+      "loss": 0.14218534231185914,
+      "step": 4745
+    },
+    {
+      "epoch": 0.8642649199417758,
+      "grad_norm": 0.24613255262374878,
+      "learning_rate": 2.4050890432080557e-06,
+      "loss": 0.13907679319381713,
+      "step": 4750
+    },
+    {
+      "epoch": 0.865174672489083,
+      "grad_norm": 0.16036023199558258,
+      "learning_rate": 2.3736855224780057e-06,
+      "loss": 0.13718113899230958,
+      "step": 4755
+    },
+    {
+      "epoch": 0.86608442503639,
+      "grad_norm": 0.16678516566753387,
+      "learning_rate": 2.3424781499862075e-06,
+      "loss": 0.1327962040901184,
+      "step": 4760
+    },
+    {
+      "epoch": 0.8669941775836972,
+      "grad_norm": 0.1763770878314972,
+      "learning_rate": 2.3114671962703727e-06,
+      "loss": 0.14390318393707274,
+      "step": 4765
+    },
+    {
+      "epoch": 0.8679039301310044,
+      "grad_norm": 0.17735697329044342,
+      "learning_rate": 2.280652930165428e-06,
+      "loss": 0.15223288536071777,
+      "step": 4770
+    },
+    {
+      "epoch": 0.8688136826783115,
+      "grad_norm": 0.15827041864395142,
+      "learning_rate": 2.250035618801241e-06,
+      "loss": 0.14296332597732545,
+      "step": 4775
+    },
+    {
+      "epoch": 0.8697234352256187,
+      "grad_norm": 0.16876135766506195,
+      "learning_rate": 2.219615527600244e-06,
+      "loss": 0.1359076738357544,
+      "step": 4780
+    },
+    {
+      "epoch": 0.8706331877729258,
+      "grad_norm": 0.1800110638141632,
+      "learning_rate": 2.189392920275174e-06,
+      "loss": 0.1424281358718872,
+      "step": 4785
+    },
+    {
+      "epoch": 0.8715429403202329,
+      "grad_norm": 0.1409560889005661,
+      "learning_rate": 2.159368058826783e-06,
+      "loss": 0.14480490684509278,
+      "step": 4790
+    },
+    {
+      "epoch": 0.87245269286754,
+      "grad_norm": 0.1634288728237152,
+      "learning_rate": 2.129541203541535e-06,
+      "loss": 0.14513269662857056,
+      "step": 4795
+    },
+    {
+      "epoch": 0.8733624454148472,
+      "grad_norm": 0.17126062512397766,
+      "learning_rate": 2.099912612989391e-06,
+      "loss": 0.13546934127807617,
+      "step": 4800
+    },
+    {
+      "epoch": 0.8742721979621543,
+      "grad_norm": 0.16704080998897552,
+      "learning_rate": 2.0704825440215457e-06,
+      "loss": 0.13852492570877076,
+      "step": 4805
+    },
+    {
+      "epoch": 0.8751819505094615,
+      "grad_norm": 0.1725970208644867,
+      "learning_rate": 2.0412512517681946e-06,
+      "loss": 0.14504197835922242,
+      "step": 4810
+    },
+    {
+      "epoch": 0.8760917030567685,
+      "grad_norm": 0.1700201779603958,
+      "learning_rate": 2.0122189896363387e-06,
+      "loss": 0.14312338829040527,
+      "step": 4815
+    },
+    {
+      "epoch": 0.8770014556040757,
+      "grad_norm": 0.16491736471652985,
+      "learning_rate": 1.9833860093075834e-06,
+      "loss": 0.14062976837158203,
+      "step": 4820
+    },
+    {
+      "epoch": 0.8779112081513828,
+      "grad_norm": 0.13748787343502045,
+      "learning_rate": 1.9547525607359537e-06,
+      "loss": 0.1346171498298645,
+      "step": 4825
+    },
+    {
+      "epoch": 0.87882096069869,
+      "grad_norm": 0.16399399936199188,
+      "learning_rate": 1.926318892145712e-06,
+      "loss": 0.14178123474121093,
+      "step": 4830
+    },
+    {
+      "epoch": 0.879730713245997,
+      "grad_norm": 0.14491963386535645,
+      "learning_rate": 1.8980852500292412e-06,
+      "loss": 0.1408564567565918,
+      "step": 4835
+    },
+    {
+      "epoch": 0.8806404657933042,
+      "grad_norm": 0.17335423827171326,
+      "learning_rate": 1.8700518791448851e-06,
+      "loss": 0.14403265714645386,
+      "step": 4840
+    },
+    {
+      "epoch": 0.8815502183406113,
+      "grad_norm": 0.17399625480175018,
+      "learning_rate": 1.8422190225148155e-06,
+      "loss": 0.14289036989212037,
+      "step": 4845
+    },
+    {
+      "epoch": 0.8824599708879185,
+      "grad_norm": 0.17945612967014313,
+      "learning_rate": 1.814586921422956e-06,
+      "loss": 0.14494109153747559,
+      "step": 4850
+    },
+    {
+      "epoch": 0.8833697234352256,
+      "grad_norm": 0.1910620480775833,
+      "learning_rate": 1.7871558154128664e-06,
+      "loss": 0.13726245164871215,
+      "step": 4855
+    },
+    {
+      "epoch": 0.8842794759825328,
+      "grad_norm": 0.1771879345178604,
+      "learning_rate": 1.7599259422856756e-06,
+      "loss": 0.1464752197265625,
+      "step": 4860
+    },
+    {
+      "epoch": 0.8851892285298398,
+      "grad_norm": 0.19427461922168732,
+      "learning_rate": 1.7328975380980218e-06,
+      "loss": 0.13823356628417968,
+      "step": 4865
+    },
+    {
+      "epoch": 0.886098981077147,
+      "grad_norm": 0.1491149365901947,
+      "learning_rate": 1.7060708371599897e-06,
+      "loss": 0.1338604211807251,
+      "step": 4870
+    },
+    {
+      "epoch": 0.8870087336244541,
+      "grad_norm": 0.16087733209133148,
+      "learning_rate": 1.6794460720331057e-06,
+      "loss": 0.14184389114379883,
+      "step": 4875
+    },
+    {
+      "epoch": 0.8879184861717613,
+      "grad_norm": 0.14506325125694275,
+      "learning_rate": 1.653023473528309e-06,
+      "loss": 0.14267687797546386,
+      "step": 4880
+    },
+    {
+      "epoch": 0.8888282387190685,
+      "grad_norm": 0.16886365413665771,
+      "learning_rate": 1.626803270703936e-06,
+      "loss": 0.14266083240509034,
+      "step": 4885
+    },
+    {
+      "epoch": 0.8897379912663755,
+      "grad_norm": 0.1891999989748001,
+      "learning_rate": 1.6007856908637652e-06,
+      "loss": 0.1398016929626465,
+      "step": 4890
+    },
+    {
+      "epoch": 0.8906477438136827,
+      "grad_norm": 0.17645299434661865,
+      "learning_rate": 1.5749709595550083e-06,
+      "loss": 0.13869571685791016,
+      "step": 4895
+    },
+    {
+      "epoch": 0.8915574963609898,
+      "grad_norm": 0.17714262008666992,
+      "learning_rate": 1.549359300566408e-06,
+      "loss": 0.14957486391067504,
+      "step": 4900
+    },
+    {
+      "epoch": 0.892467248908297,
+      "grad_norm": 0.18025240302085876,
+      "learning_rate": 1.5239509359262355e-06,
+      "loss": 0.1358652949333191,
+      "step": 4905
+    },
+    {
+      "epoch": 0.8933770014556041,
+      "grad_norm": 0.17539937794208527,
+      "learning_rate": 1.4987460859004154e-06,
+      "loss": 0.13833394050598144,
+      "step": 4910
+    },
+    {
+      "epoch": 0.8942867540029112,
+      "grad_norm": 0.1772230565547943,
+      "learning_rate": 1.4737449689905953e-06,
+      "loss": 0.14202116727828978,
+      "step": 4915
+    },
+    {
+      "epoch": 0.8951965065502183,
+      "grad_norm": 0.1670161783695221,
+      "learning_rate": 1.4489478019322433e-06,
+      "loss": 0.1403665542602539,
+      "step": 4920
+    },
+    {
+      "epoch": 0.8961062590975255,
+      "grad_norm": 0.1697034239768982,
+      "learning_rate": 1.4243547996927926e-06,
+      "loss": 0.1401481032371521,
+      "step": 4925
+    },
+    {
+      "epoch": 0.8970160116448326,
+      "grad_norm": 0.16474860906600952,
+      "learning_rate": 1.3999661754697636e-06,
+      "loss": 0.13969850540161133,
+      "step": 4930
+    },
+    {
+      "epoch": 0.8979257641921398,
+      "grad_norm": 0.1664883941411972,
+      "learning_rate": 1.3757821406889027e-06,
+      "loss": 0.1399069309234619,
+      "step": 4935
+    },
+    {
+      "epoch": 0.8988355167394468,
+      "grad_norm": 0.16675794124603271,
+      "learning_rate": 1.351802905002386e-06,
+      "loss": 0.14129226207733153,
+      "step": 4940
+    },
+    {
+      "epoch": 0.899745269286754,
+      "grad_norm": 0.17529809474945068,
+      "learning_rate": 1.3280286762869632e-06,
+      "loss": 0.14663081169128417,
+      "step": 4945
+    },
+    {
+      "epoch": 0.9006550218340611,
+      "grad_norm": 0.17758169770240784,
+      "learning_rate": 1.3044596606421795e-06,
+      "loss": 0.13986254930496217,
+      "step": 4950
+    },
+    {
+      "epoch": 0.9015647743813683,
+      "grad_norm": 0.153225839138031,
+      "learning_rate": 1.2810960623885815e-06,
+      "loss": 0.14236698150634766,
+      "step": 4955
+    },
+    {
+      "epoch": 0.9024745269286754,
+      "grad_norm": 0.169761523604393,
+      "learning_rate": 1.2579380840659376e-06,
+      "loss": 0.1450445055961609,
+      "step": 4960
+    },
+    {
+      "epoch": 0.9033842794759825,
+      "grad_norm": 0.16659331321716309,
+      "learning_rate": 1.2349859264315034e-06,
+      "loss": 0.14043926000595092,
+      "step": 4965
+    },
+    {
+      "epoch": 0.9042940320232896,
+      "grad_norm": 0.16748706996440887,
+      "learning_rate": 1.2122397884582553e-06,
+      "loss": 0.14725675582885742,
+      "step": 4970
+    },
+    {
+      "epoch": 0.9052037845705968,
+      "grad_norm": 0.1600511223077774,
+      "learning_rate": 1.1896998673331883e-06,
+      "loss": 0.14551150798797607,
+      "step": 4975
+    },
+    {
+      "epoch": 0.9061135371179039,
+      "grad_norm": 0.24318362772464752,
+      "learning_rate": 1.1673663584555934e-06,
+      "loss": 0.14470888376235963,
+      "step": 4980
+    },
+    {
+      "epoch": 0.9070232896652111,
+      "grad_norm": 0.16443821787834167,
+      "learning_rate": 1.1452394554353706e-06,
+      "loss": 0.13639854192733764,
+      "step": 4985
+    },
+    {
+      "epoch": 0.9079330422125182,
+      "grad_norm": 0.14277774095535278,
+      "learning_rate": 1.1233193500913453e-06,
+      "loss": 0.13749881982803344,
+      "step": 4990
+    },
+    {
+      "epoch": 0.9088427947598253,
+      "grad_norm": 0.1610947549343109,
+      "learning_rate": 1.1016062324496008e-06,
+      "loss": 0.1385629653930664,
+      "step": 4995
+    },
+    {
+      "epoch": 0.9097525473071325,
+      "grad_norm": 0.17888498306274414,
+      "learning_rate": 1.080100290741845e-06,
+      "loss": 0.14225621223449708,
+      "step": 5000
+    },
+    {
+      "epoch": 0.9106622998544396,
+      "grad_norm": 0.17488449811935425,
+      "learning_rate": 1.0588017114037729e-06,
+      "loss": 0.14187805652618407,
+      "step": 5005
+    },
+    {
+      "epoch": 0.9115720524017468,
+      "grad_norm": 0.16410665214061737,
+      "learning_rate": 1.0377106790734392e-06,
+      "loss": 0.1407416582107544,
+      "step": 5010
+    },
+    {
+      "epoch": 0.9124818049490538,
+      "grad_norm": 0.18115971982479095,
+      "learning_rate": 1.016827376589674e-06,
+      "loss": 0.1427263855934143,
+      "step": 5015
+    },
+    {
+      "epoch": 0.913391557496361,
+      "grad_norm": 0.18507841229438782,
+      "learning_rate": 9.961519849904898e-07,
+      "loss": 0.1390499472618103,
+      "step": 5020
+    },
+    {
+      "epoch": 0.9143013100436681,
+      "grad_norm": 0.21296796202659607,
+      "learning_rate": 9.75684683511513e-07,
+      "loss": 0.1382216691970825,
+      "step": 5025
+    },
+    {
+      "epoch": 0.9152110625909753,
+      "grad_norm": 0.2308044582605362,
+      "learning_rate": 9.55425649584435e-07,
+      "loss": 0.14271280765533448,
+      "step": 5030
+    },
+    {
+      "epoch": 0.9161208151382824,
+      "grad_norm": 0.15796682238578796,
+      "learning_rate": 9.353750588354527e-07,
+      "loss": 0.13807624578475952,
+      "step": 5035
+    },
+    {
+      "epoch": 0.9170305676855895,
+      "grad_norm": 0.1695316582918167,
+      "learning_rate": 9.155330850837834e-07,
+      "loss": 0.14289476871490478,
+      "step": 5040
+    },
+    {
+      "epoch": 0.9179403202328966,
+      "grad_norm": 0.1738404780626297,
+      "learning_rate": 8.958999003401191e-07,
+      "loss": 0.14070619344711305,
+      "step": 5045
+    },
+    {
+      "epoch": 0.9188500727802038,
+      "grad_norm": 0.20618964731693268,
+      "learning_rate": 8.764756748051662e-07,
+      "loss": 0.14535053968429565,
+      "step": 5050
+    },
+    {
+      "epoch": 0.9197598253275109,
+      "grad_norm": 0.1506137251853943,
+      "learning_rate": 8.572605768681546e-07,
+      "loss": 0.13995139598846434,
+      "step": 5055
+    },
+    {
+      "epoch": 0.9206695778748181,
+      "grad_norm": 0.17772039771080017,
+      "learning_rate": 8.382547731053708e-07,
+      "loss": 0.14470311403274536,
+      "step": 5060
+    },
+    {
+      "epoch": 0.9215793304221251,
+      "grad_norm": 0.19897456467151642,
+      "learning_rate": 8.194584282787382e-07,
+      "loss": 0.144488525390625,
+      "step": 5065
+    },
+    {
+      "epoch": 0.9224890829694323,
+      "grad_norm": 0.15899236500263214,
+      "learning_rate": 8.008717053343606e-07,
+      "loss": 0.1352991580963135,
+      "step": 5070
+    },
+    {
+      "epoch": 0.9233988355167394,
+      "grad_norm": 0.14965768158435822,
+      "learning_rate": 7.824947654011345e-07,
+      "loss": 0.13827911615371705,
+      "step": 5075
+    },
+    {
+      "epoch": 0.9243085880640466,
+      "grad_norm": 0.43651485443115234,
+      "learning_rate": 7.643277677893329e-07,
+      "loss": 0.14149526357650757,
+      "step": 5080
+    },
+    {
+      "epoch": 0.9252183406113537,
+      "grad_norm": 0.19912713766098022,
+      "learning_rate": 7.463708699892325e-07,
+      "loss": 0.14357032775878906,
+      "step": 5085
+    },
+    {
+      "epoch": 0.9261280931586608,
+      "grad_norm": 0.1635904610157013,
+      "learning_rate": 7.286242276697524e-07,
+      "loss": 0.13550699949264527,
+      "step": 5090
+    },
+    {
+      "epoch": 0.9270378457059679,
+      "grad_norm": 0.19391080737113953,
+      "learning_rate": 7.11087994677101e-07,
+      "loss": 0.14674756526947022,
+      "step": 5095
+    },
+    {
+      "epoch": 0.9279475982532751,
+      "grad_norm": 0.17458125948905945,
+      "learning_rate": 6.937623230334284e-07,
+      "loss": 0.14155579805374147,
+      "step": 5100
+    },
+    {
+      "epoch": 0.9288573508005823,
+      "grad_norm": 0.1617971807718277,
+      "learning_rate": 6.766473629355452e-07,
+      "loss": 0.140555477142334,
+      "step": 5105
+    },
+    {
+      "epoch": 0.9297671033478894,
+      "grad_norm": 0.16945427656173706,
+      "learning_rate": 6.59743262753576e-07,
+      "loss": 0.13607511520385743,
+      "step": 5110
+    },
+    {
+      "epoch": 0.9306768558951966,
+      "grad_norm": 0.18347840011119843,
+      "learning_rate": 6.43050169029702e-07,
+      "loss": 0.14903461933135986,
+      "step": 5115
+    },
+    {
+      "epoch": 0.9315866084425036,
+      "grad_norm": 0.15434837341308594,
+      "learning_rate": 6.265682264768869e-07,
+      "loss": 0.14146015644073487,
+      "step": 5120
+    },
+    {
+      "epoch": 0.9324963609898108,
+      "grad_norm": 0.1397712528705597,
+      "learning_rate": 6.10297577977606e-07,
+      "loss": 0.14261592626571656,
+      "step": 5125
+    },
+    {
+      "epoch": 0.9334061135371179,
+      "grad_norm": 0.1765873283147812,
+      "learning_rate": 5.942383645826361e-07,
+      "loss": 0.13559447526931762,
+      "step": 5130
+    },
+    {
+      "epoch": 0.9343158660844251,
+      "grad_norm": 0.1656057983636856,
+      "learning_rate": 5.783907255098003e-07,
+      "loss": 0.13961490392684936,
+      "step": 5135
+    },
+    {
+      "epoch": 0.9352256186317321,
+      "grad_norm": 0.2169366180896759,
+      "learning_rate": 5.627547981427894e-07,
+      "loss": 0.1447835922241211,
+      "step": 5140
+    },
+    {
+      "epoch": 0.9361353711790393,
+      "grad_norm": 0.18623125553131104,
+      "learning_rate": 5.473307180299508e-07,
+      "loss": 0.14366730451583862,
+      "step": 5145
+    },
+    {
+      "epoch": 0.9370451237263464,
+      "grad_norm": 0.15423963963985443,
+      "learning_rate": 5.32118618883129e-07,
+      "loss": 0.14295632839202882,
+      "step": 5150
+    },
+    {
+      "epoch": 0.9379548762736536,
+      "grad_norm": 0.18423247337341309,
+      "learning_rate": 5.17118632576491e-07,
+      "loss": 0.14137414693832398,
+      "step": 5155
+    },
+    {
+      "epoch": 0.9388646288209607,
+      "grad_norm": 0.15338757634162903,
+      "learning_rate": 5.023308891453915e-07,
+      "loss": 0.13583066463470458,
+      "step": 5160
+    },
+    {
+      "epoch": 0.9397743813682679,
+      "grad_norm": 0.2293633222579956,
+      "learning_rate": 4.877555167852515e-07,
+      "loss": 0.14819620847702025,
+      "step": 5165
+    },
+    {
+      "epoch": 0.9406841339155749,
+      "grad_norm": 0.16889944672584534,
+      "learning_rate": 4.7339264185043974e-07,
+      "loss": 0.13617686033248902,
+      "step": 5170
+    },
+    {
+      "epoch": 0.9415938864628821,
+      "grad_norm": 0.1767464578151703,
+      "learning_rate": 4.5924238885316775e-07,
+      "loss": 0.13487552404403685,
+      "step": 5175
+    },
+    {
+      "epoch": 0.9425036390101892,
+      "grad_norm": 0.16697899997234344,
+      "learning_rate": 4.453048804624327e-07,
+      "loss": 0.1446886420249939,
+      "step": 5180
+    },
+    {
+      "epoch": 0.9434133915574964,
+      "grad_norm": 0.19576266407966614,
+      "learning_rate": 4.315802375029293e-07,
+      "loss": 0.14252450466156005,
+      "step": 5185
+    },
+    {
+      "epoch": 0.9443231441048034,
+      "grad_norm": 0.14838077127933502,
+      "learning_rate": 4.18068578954034e-07,
+      "loss": 0.13933032751083374,
+      "step": 5190
+    },
+    {
+      "epoch": 0.9452328966521106,
+      "grad_norm": 0.18481744825839996,
+      "learning_rate": 4.047700219487388e-07,
+      "loss": 0.1410665273666382,
+      "step": 5195
+    },
+    {
+      "epoch": 0.9461426491994177,
+      "grad_norm": 0.16954176127910614,
+      "learning_rate": 3.9168468177265547e-07,
+      "loss": 0.1421758770942688,
+      "step": 5200
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.857910203994113e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-5200/training_args.bin b/checkpoint-5200/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-5200/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-5300/README.md b/checkpoint-5300/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-5300/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-5300/adapter_config.json b/checkpoint-5300/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-5300/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-5300/adapter_model.safetensors b/checkpoint-5300/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..fefa75a7bec8a4d2fb4a21708e4fcf1074bfedbe
--- /dev/null
+++ b/checkpoint-5300/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03d134e5bd0379593af8640a9695296c13be790cd99b98f86152deb3956e3d52
+size 169741912
diff --git a/checkpoint-5300/chat_template.jinja b/checkpoint-5300/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-5300/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders a schema properties mapping as comma-separated key:{...} entries in sorted key order. When filter_keys is true, keys listed in standard_keys are skipped. NOTE(review): the required argument is accepted but not referenced anywhere in this macro body. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- add_comma records whether a field has already been written inside this entry, so later fields know to emit a separator first -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- no usable 'properties' mapping: the value's own keys are rendered as its properties, with the standard schema keys filtered out -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}{#- type is always written last; the brace after it closes this entry -#}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool as declaration:NAME{description:...} with optional ,parameters:{...} and ,response:{...} sections taken from tool_data['function']. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}{#- NOTE(review): the brace closing parameters:{ is only written in this branch, i.e. when params['type'] is truthy; confirm callers always supply a type -#}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}{#- NOTE(review): the brace closing response:{ is only written when the response type is OBJECT; confirm other types are never passed -#}
+        {%- endif -%}
+    {%- endif -%}
+    }{#- closes the declaration body opened after the tool name -#}
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serializes one value: strings are wrapped in <|"|> markers, booleans become true/false, mappings become {k:v,...} in sorted key order (keys are wrapped in <|"|> only when escape_keys is true), other sequences become [a,b,...], and anything else is emitted unchanged. Nested values recurse with the same escape_keys setting. -#}
+    {%- if argument is string -%}{#- strings are handled here, ahead of the sequence branch further down -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Rebuilds text without thinking spans: the text is split on '<channel|>', each piece is cut at its first '<|channel>', the kept parts are concatenated, and the result is trimmed. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}{#- keep only what precedes the channel opener -#}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emits <|tool_response>response:NAME{...}<tool_response|>. A mapping response is written as key:value pairs in sorted key order; any other response is written as a single value: field. Keys are never wrapped in <|"|> markers here (escape_keys=False). -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- ns.prev_message_type records what was emitted last; it is read later when closing a turn and when deciding whether to add the generation prompt -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- System turn: emitted when thinking is enabled, tools are supplied, or the first message has role system/developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- The <|think|> marker is written first inside the system turn, before any system text or tool declarations -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the leading system/developer message is consumed here, so the main loop does not render it again -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: record the index of the last user message in loop_messages (-1 when there is none); the message loop renders reasoning only for messages after that index -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Main loop: render every non-tool message; role:tool messages are rendered from the assistant message that precedes them -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Continuation: find the nearest earlier non-tool message; if it was also an assistant message, do not open a second <|turn>model -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Reasoning (reasoning or reasoning_content) is rendered as a thought channel only for messages after the last user message that also carry tool calls -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+    {#- Tool calls: one <|tool_call>call:NAME{...}<tool_call|> per entry; mapping arguments are written in sorted key order, string arguments verbatim -#}
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+            {#- Tool responses: either embedded on this message, or taken from the role:tool messages that directly follow it -#}
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages, stopping at the first non-tool message -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name. default(..., true) is required for the fallback: get() yields None for a missing name, and plain default() only replaces undefined values -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array; only parts of type text are concatenated -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {#- Capture the message content first so the turn-closing logic below can tell whether any text was produced -#}
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+        {#- Closing: a tool call with no response leaves <|tool_response> open; a message that emitted tool responses but no content leaves the turn open; otherwise the turn is closed -#}
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- Open a model turn for generation, except when the last rendered message ended on a tool call or tool response, in which case no new turn header is written -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-5300/optimizer.pt b/checkpoint-5300/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..652eb0c69fcac1c906678deeee2e36cb5ccf5b1f
--- /dev/null
+++ b/checkpoint-5300/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ab24f6e8be492fde0cbb674956956b52430e2856ceb6e24d1dccd0203a0255a
+size 72807355
diff --git a/checkpoint-5300/processor_config.json b/checkpoint-5300/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-5300/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-5300/rng_state.pth b/checkpoint-5300/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-5300/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-5300/scheduler.pt b/checkpoint-5300/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..cefb749dd36fecbf51d2bd3f5e120fbe964a5179
--- /dev/null
+++ b/checkpoint-5300/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e15f40cb3aebdb433d431e37533a02b8207ffc5eeb3fa5f208e35510462495e
+size 1465
diff --git a/checkpoint-5300/tokenizer.json b/checkpoint-5300/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-5300/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-5300/tokenizer_config.json b/checkpoint-5300/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-5300/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-5300/trainer_state.json b/checkpoint-5300/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..01c06afce9b8addb755964322512fc600e853d03
--- /dev/null
+++ b/checkpoint-5300/trainer_state.json
@@ -0,0 +1,7462 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9643377001455604,
+  "eval_steps": 100,
+  "global_step": 5300,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6377365356622998,
+      "grad_norm": 0.17014239728450775,
+      "learning_rate": 1.5370045130657366e-05,
+      "loss": 0.14694437980651856,
+      "step": 3505
+    },
+    {
+      "epoch": 0.638646288209607,
+      "grad_norm": 0.14744038879871368,
+      "learning_rate": 1.5302158945041838e-05,
+      "loss": 0.14434736967086792,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6395560407569141,
+      "grad_norm": 0.2069770246744156,
+      "learning_rate": 1.523435683031818e-05,
+      "loss": 0.13982917070388795,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6404657933042213,
+      "grad_norm": 0.17811502516269684,
+      "learning_rate": 1.5166639374265063e-05,
+      "loss": 0.1408839702606201,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6413755458515283,
+      "grad_norm": 0.165786474943161,
+      "learning_rate": 1.509900716392728e-05,
+      "loss": 0.15312877893447877,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6422852983988355,
+      "grad_norm": 0.1633884161710739,
+      "learning_rate": 1.5031460785610596e-05,
+      "loss": 0.1488795518875122,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6431950509461426,
+      "grad_norm": 0.16498984396457672,
+      "learning_rate": 1.4964000824876723e-05,
+      "loss": 0.15031465291976928,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6441048034934498,
+      "grad_norm": 0.18043678998947144,
+      "learning_rate": 1.4896627866538191e-05,
+      "loss": 0.147829806804657,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6450145560407569,
+      "grad_norm": 0.16813597083091736,
+      "learning_rate": 1.4829342494653315e-05,
+      "loss": 0.1418998956680298,
+      "step": 3545
+    },
+    {
+      "epoch": 0.645924308588064,
+      "grad_norm": 0.1817242056131363,
+      "learning_rate": 1.4762145292521118e-05,
+      "loss": 0.14508869647979736,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6468340611353712,
+      "grad_norm": 0.14666494727134705,
+      "learning_rate": 1.469503684267628e-05,
+      "loss": 0.14159854650497436,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6477438136826783,
+      "grad_norm": 0.16485381126403809,
+      "learning_rate": 1.4628017726884086e-05,
+      "loss": 0.14419105052947997,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6486535662299855,
+      "grad_norm": 0.16100342571735382,
+      "learning_rate": 1.4561088526135375e-05,
+      "loss": 0.14501721858978273,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6495633187772926,
+      "grad_norm": 0.16996590793132782,
+      "learning_rate": 1.4494249820641493e-05,
+      "loss": 0.1377166509628296,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6504730713245997,
+      "grad_norm": 0.16168837249279022,
+      "learning_rate": 1.4427502189829339e-05,
+      "loss": 0.1414325475692749,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6513828238719068,
+      "grad_norm": 0.16318906843662262,
+      "learning_rate": 1.436084621233621e-05,
+      "loss": 0.14685193300247193,
+      "step": 3580
+    },
+    {
+      "epoch": 0.652292576419214,
+      "grad_norm": 0.1636219322681427,
+      "learning_rate": 1.4294282466004899e-05,
+      "loss": 0.1405899167060852,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6532023289665211,
+      "grad_norm": 0.1838461309671402,
+      "learning_rate": 1.422781152787865e-05,
+      "loss": 0.14386332035064697,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6541120815138283,
+      "grad_norm": 0.1796344667673111,
+      "learning_rate": 1.4161433974196115e-05,
+      "loss": 0.1513024687767029,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6550218340611353,
+      "grad_norm": 0.16424529254436493,
+      "learning_rate": 1.4095150380386427e-05,
+      "loss": 0.14238927364349366,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6559315866084425,
+      "grad_norm": 0.19264160096645355,
+      "learning_rate": 1.402896132106415e-05,
+      "loss": 0.14297477006912232,
+      "step": 3605
+    },
+    {
+      "epoch": 0.6568413391557496,
+      "grad_norm": 0.18319948017597198,
+      "learning_rate": 1.3962867370024347e-05,
+      "loss": 0.1448880434036255,
+      "step": 3610
+    },
+    {
+      "epoch": 0.6577510917030568,
+      "grad_norm": 0.16507290303707123,
+      "learning_rate": 1.389686910023758e-05,
+      "loss": 0.14724698066711425,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6586608442503639,
+      "grad_norm": 0.17871244251728058,
+      "learning_rate": 1.3830967083844942e-05,
+      "loss": 0.14479386806488037,
+      "step": 3620
+    },
+    {
+      "epoch": 0.659570596797671,
+      "grad_norm": 0.1846228390932083,
+      "learning_rate": 1.3765161892153112e-05,
+      "loss": 0.1453616738319397,
+      "step": 3625
+    },
+    {
+      "epoch": 0.6604803493449781,
+      "grad_norm": 0.17185978591442108,
+      "learning_rate": 1.3699454095629372e-05,
+      "loss": 0.14906206130981445,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6613901018922853,
+      "grad_norm": 0.14751191437244415,
+      "learning_rate": 1.3633844263896698e-05,
+      "loss": 0.13991892337799072,
+      "step": 3635
+    },
+    {
+      "epoch": 0.6622998544395924,
+      "grad_norm": 0.22059763967990875,
+      "learning_rate": 1.3568332965728817e-05,
+      "loss": 0.14680869579315187,
+      "step": 3640
+    },
+    {
+      "epoch": 0.6632096069868996,
+      "grad_norm": 0.15295909345149994,
+      "learning_rate": 1.3502920769045232e-05,
+      "loss": 0.1404443383216858,
+      "step": 3645
+    },
+    {
+      "epoch": 0.6641193595342066,
+      "grad_norm": 0.14600558578968048,
+      "learning_rate": 1.3437608240906364e-05,
+      "loss": 0.14663270711898804,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6650291120815138,
+      "grad_norm": 0.15548352897167206,
+      "learning_rate": 1.3372395947508587e-05,
+      "loss": 0.1431443452835083,
+      "step": 3655
+    },
+    {
+      "epoch": 0.665938864628821,
+      "grad_norm": 0.1813388466835022,
+      "learning_rate": 1.3307284454179342e-05,
+      "loss": 0.1458706736564636,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6668486171761281,
+      "grad_norm": 0.16326870024204254,
+      "learning_rate": 1.3242274325372247e-05,
+      "loss": 0.14700595140457154,
+      "step": 3665
+    },
+    {
+      "epoch": 0.6677583697234353,
+      "grad_norm": 0.18779197335243225,
+      "learning_rate": 1.3177366124662149e-05,
+      "loss": 0.1497237801551819,
+      "step": 3670
+    },
+    {
+      "epoch": 0.6686681222707423,
+      "grad_norm": 0.16291002929210663,
+      "learning_rate": 1.3112560414740315e-05,
+      "loss": 0.1387086868286133,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6695778748180495,
+      "grad_norm": 0.1532297134399414,
+      "learning_rate": 1.3047857757409487e-05,
+      "loss": 0.14497545957565308,
+      "step": 3680
+    },
+    {
+      "epoch": 0.6704876273653566,
+      "grad_norm": 0.14697515964508057,
+      "learning_rate": 1.2983258713579066e-05,
+      "loss": 0.1494283437728882,
+      "step": 3685
+    },
+    {
+      "epoch": 0.6713973799126638,
+      "grad_norm": 0.15213452279567719,
+      "learning_rate": 1.2918763843260218e-05,
+      "loss": 0.1468907594680786,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6723071324599709,
+      "grad_norm": 0.1745215803384781,
+      "learning_rate": 1.285437370556099e-05,
+      "loss": 0.14997754096984864,
+      "step": 3695
+    },
+    {
+      "epoch": 0.673216885007278,
+      "grad_norm": 0.19207637012004852,
+      "learning_rate": 1.2790088858681577e-05,
+      "loss": 0.14202862977981567,
+      "step": 3700
+    },
+    {
+      "epoch": 0.6741266375545851,
+      "grad_norm": 0.1521359086036682,
+      "learning_rate": 1.2725909859909313e-05,
+      "loss": 0.14547673463821412,
+      "step": 3705
+    },
+    {
+      "epoch": 0.6750363901018923,
+      "grad_norm": 0.16975535452365875,
+      "learning_rate": 1.2661837265613999e-05,
+      "loss": 0.14006874561309815,
+      "step": 3710
+    },
+    {
+      "epoch": 0.6759461426491994,
+      "grad_norm": 0.22234582901000977,
+      "learning_rate": 1.2597871631242992e-05,
+      "loss": 0.13691173791885375,
+      "step": 3715
+    },
+    {
+      "epoch": 0.6768558951965066,
+      "grad_norm": 0.16082969307899475,
+      "learning_rate": 1.2534013511316383e-05,
+      "loss": 0.14932308197021485,
+      "step": 3720
+    },
+    {
+      "epoch": 0.6777656477438136,
+      "grad_norm": 0.1751091182231903,
+      "learning_rate": 1.247026345942226e-05,
+      "loss": 0.14531974792480468,
+      "step": 3725
+    },
+    {
+      "epoch": 0.6786754002911208,
+      "grad_norm": 0.15838147699832916,
+      "learning_rate": 1.2406622028211844e-05,
+      "loss": 0.14759832620620728,
+      "step": 3730
+    },
+    {
+      "epoch": 0.6795851528384279,
+      "grad_norm": 0.1771744042634964,
+      "learning_rate": 1.2343089769394714e-05,
+      "loss": 0.1382831573486328,
+      "step": 3735
+    },
+    {
+      "epoch": 0.6804949053857351,
+      "grad_norm": 0.16301538050174713,
+      "learning_rate": 1.2279667233734037e-05,
+      "loss": 0.14444775581359864,
+      "step": 3740
+    },
+    {
+      "epoch": 0.6814046579330422,
+      "grad_norm": 0.1584121286869049,
+      "learning_rate": 1.2216354971041796e-05,
+      "loss": 0.14200170040130616,
+      "step": 3745
+    },
+    {
+      "epoch": 0.6823144104803494,
+      "grad_norm": 0.139187291264534,
+      "learning_rate": 1.2153153530174007e-05,
+      "loss": 0.14318310022354125,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6832241630276564,
+      "grad_norm": 0.13665248453617096,
+      "learning_rate": 1.2090063459025955e-05,
+      "loss": 0.1411946654319763,
+      "step": 3755
+    },
+    {
+      "epoch": 0.6841339155749636,
+      "grad_norm": 0.16273781657218933,
+      "learning_rate": 1.2027085304527475e-05,
+      "loss": 0.14873508214950562,
+      "step": 3760
+    },
+    {
+      "epoch": 0.6850436681222707,
+      "grad_norm": 0.16317526996135712,
+      "learning_rate": 1.1964219612638194e-05,
+      "loss": 0.14644203186035157,
+      "step": 3765
+    },
+    {
+      "epoch": 0.6859534206695779,
+      "grad_norm": 0.17253617942333221,
+      "learning_rate": 1.1901466928342777e-05,
+      "loss": 0.14027841091156007,
+      "step": 3770
+    },
+    {
+      "epoch": 0.6868631732168851,
+      "grad_norm": 0.19692830741405487,
+      "learning_rate": 1.183882779564624e-05,
+      "loss": 0.14411110877990724,
+      "step": 3775
+    },
+    {
+      "epoch": 0.6877729257641921,
+      "grad_norm": 0.15444578230381012,
+      "learning_rate": 1.1776302757569214e-05,
+      "loss": 0.14355008602142333,
+      "step": 3780
+    },
+    {
+      "epoch": 0.6886826783114993,
+      "grad_norm": 0.1622200757265091,
+      "learning_rate": 1.1713892356143239e-05,
+      "loss": 0.14794334173202514,
+      "step": 3785
+    },
+    {
+      "epoch": 0.6895924308588064,
+      "grad_norm": 0.1898501068353653,
+      "learning_rate": 1.1651597132406073e-05,
+      "loss": 0.1418622612953186,
+      "step": 3790
+    },
+    {
+      "epoch": 0.6905021834061136,
+      "grad_norm": 0.17803208529949188,
+      "learning_rate": 1.1589417626396973e-05,
+      "loss": 0.14576040506362914,
+      "step": 3795
+    },
+    {
+      "epoch": 0.6914119359534207,
+      "grad_norm": 0.17138013243675232,
+      "learning_rate": 1.1527354377152053e-05,
+      "loss": 0.14494270086288452,
+      "step": 3800
+    },
+    {
+      "epoch": 0.6923216885007278,
+      "grad_norm": 0.15170913934707642,
+      "learning_rate": 1.1465407922699603e-05,
+      "loss": 0.144084370136261,
+      "step": 3805
+    },
+    {
+      "epoch": 0.6932314410480349,
+      "grad_norm": 0.158562570810318,
+      "learning_rate": 1.1403578800055387e-05,
+      "loss": 0.13636608123779298,
+      "step": 3810
+    },
+    {
+      "epoch": 0.6941411935953421,
+      "grad_norm": 0.17687302827835083,
+      "learning_rate": 1.1341867545218044e-05,
+      "loss": 0.14214688539505005,
+      "step": 3815
+    },
+    {
+      "epoch": 0.6950509461426492,
+      "grad_norm": 0.15394899249076843,
+      "learning_rate": 1.1280274693164378e-05,
+      "loss": 0.14914129972457885,
+      "step": 3820
+    },
+    {
+      "epoch": 0.6959606986899564,
+      "grad_norm": 0.15709355473518372,
+      "learning_rate": 1.12188007778448e-05,
+      "loss": 0.14798580408096312,
+      "step": 3825
+    },
+    {
+      "epoch": 0.6968704512372634,
+      "grad_norm": 0.16631539165973663,
+      "learning_rate": 1.115744633217864e-05,
+      "loss": 0.14756966829299928,
+      "step": 3830
+    },
+    {
+      "epoch": 0.6977802037845706,
+      "grad_norm": 0.15893076360225677,
+      "learning_rate": 1.109621188804951e-05,
+      "loss": 0.14061959981918334,
+      "step": 3835
+    },
+    {
+      "epoch": 0.6986899563318777,
+      "grad_norm": 0.183414489030838,
+      "learning_rate": 1.103509797630077e-05,
+      "loss": 0.1448473334312439,
+      "step": 3840
+    },
+    {
+      "epoch": 0.6995997088791849,
+      "grad_norm": 0.14087305963039398,
+      "learning_rate": 1.0974105126730841e-05,
+      "loss": 0.14369285106658936,
+      "step": 3845
+    },
+    {
+      "epoch": 0.700509461426492,
+      "grad_norm": 0.16919967532157898,
+      "learning_rate": 1.0913233868088685e-05,
+      "loss": 0.1478085398674011,
+      "step": 3850
+    },
+    {
+      "epoch": 0.7014192139737991,
+      "grad_norm": 0.1439533829689026,
+      "learning_rate": 1.0852484728069178e-05,
+      "loss": 0.14376721382141114,
+      "step": 3855
+    },
+    {
+      "epoch": 0.7023289665211062,
+      "grad_norm": 0.17719274759292603,
+      "learning_rate": 1.0791858233308521e-05,
+      "loss": 0.14089040756225585,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7032387190684134,
+      "grad_norm": 0.19753769040107727,
+      "learning_rate": 1.0731354909379754e-05,
+      "loss": 0.15021742582321168,
+      "step": 3865
+    },
+    {
+      "epoch": 0.7041484716157205,
+      "grad_norm": 0.19186992943286896,
+      "learning_rate": 1.0670975280788086e-05,
+      "loss": 0.14113202095031738,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7050582241630277,
+      "grad_norm": 0.1709229201078415,
+      "learning_rate": 1.0610719870966443e-05,
+      "loss": 0.1500566840171814,
+      "step": 3875
+    },
+    {
+      "epoch": 0.7059679767103348,
+      "grad_norm": 0.17846204340457916,
+      "learning_rate": 1.0550589202270892e-05,
+      "loss": 0.15014195442199707,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7068777292576419,
+      "grad_norm": 0.1827082335948944,
+      "learning_rate": 1.0490583795976091e-05,
+      "loss": 0.1423472762107849,
+      "step": 3885
+    },
+    {
+      "epoch": 0.7077874818049491,
+      "grad_norm": 0.17418377101421356,
+      "learning_rate": 1.043070417227083e-05,
+      "loss": 0.14668900966644288,
+      "step": 3890
+    },
+    {
+      "epoch": 0.7086972343522562,
+      "grad_norm": 0.17385616898536682,
+      "learning_rate": 1.0370950850253449e-05,
+      "loss": 0.14627279043197633,
+      "step": 3895
+    },
+    {
+      "epoch": 0.7096069868995634,
+      "grad_norm": 0.16486723721027374,
+      "learning_rate": 1.0311324347927404e-05,
+      "loss": 0.14603652954101562,
+      "step": 3900
+    },
+    {
+      "epoch": 0.7105167394468704,
+      "grad_norm": 0.21806862950325012,
+      "learning_rate": 1.0251825182196732e-05,
+      "loss": 0.1488169550895691,
+      "step": 3905
+    },
+    {
+      "epoch": 0.7114264919941776,
+      "grad_norm": 0.19884569942951202,
+      "learning_rate": 1.019245386886159e-05,
+      "loss": 0.14387656450271608,
+      "step": 3910
+    },
+    {
+      "epoch": 0.7123362445414847,
+      "grad_norm": 0.16139011085033417,
+      "learning_rate": 1.0133210922613789e-05,
+      "loss": 0.1483074426651001,
+      "step": 3915
+    },
+    {
+      "epoch": 0.7132459970887919,
+      "grad_norm": 0.17000740766525269,
+      "learning_rate": 1.007409685703229e-05,
+      "loss": 0.14050065279006957,
+      "step": 3920
+    },
+    {
+      "epoch": 0.714155749636099,
+      "grad_norm": 0.17235304415225983,
+      "learning_rate": 1.0015112184578813e-05,
+      "loss": 0.1440442681312561,
+      "step": 3925
+    },
+    {
+      "epoch": 0.7150655021834061,
+      "grad_norm": 0.15737567842006683,
+      "learning_rate": 9.956257416593362e-06,
+      "loss": 0.14960765838623047,
+      "step": 3930
+    },
+    {
+      "epoch": 0.7159752547307132,
+      "grad_norm": 0.15499180555343628,
+      "learning_rate": 9.897533063289773e-06,
+      "loss": 0.14488829374313356,
+      "step": 3935
+    },
+    {
+      "epoch": 0.7168850072780204,
+      "grad_norm": 0.17744216322898865,
+      "learning_rate": 9.838939633751337e-06,
+      "loss": 0.1416949987411499,
+      "step": 3940
+    },
+    {
+      "epoch": 0.7177947598253275,
+      "grad_norm": 0.1597192883491516,
+      "learning_rate": 9.780477635926358e-06,
+      "loss": 0.14275280237197877,
+      "step": 3945
+    },
+    {
+      "epoch": 0.7187045123726347,
+      "grad_norm": 0.17800374329090118,
+      "learning_rate": 9.722147576623743e-06,
+      "loss": 0.14532098770141602,
+      "step": 3950
+    },
+    {
+      "epoch": 0.7196142649199417,
+      "grad_norm": 0.1828162521123886,
+      "learning_rate": 9.66394996150864e-06,
+      "loss": 0.14525585174560546,
+      "step": 3955
+    },
+    {
+      "epoch": 0.7205240174672489,
+      "grad_norm": 0.1800539344549179,
+      "learning_rate": 9.605885295098005e-06,
+      "loss": 0.14235819578170777,
+      "step": 3960
+    },
+    {
+      "epoch": 0.721433770014556,
+      "grad_norm": 0.16556483507156372,
+      "learning_rate": 9.54795408075628e-06,
+      "loss": 0.13965482711791993,
+      "step": 3965
+    },
+    {
+      "epoch": 0.7223435225618632,
+      "grad_norm": 0.1592024862766266,
+      "learning_rate": 9.49015682069101e-06,
+      "loss": 0.14051042795181273,
+      "step": 3970
+    },
+    {
+      "epoch": 0.7232532751091703,
+      "grad_norm": 0.18988847732543945,
+      "learning_rate": 9.43249401594846e-06,
+      "loss": 0.1436900496482849,
+      "step": 3975
+    },
+    {
+      "epoch": 0.7241630276564774,
+      "grad_norm": 0.24433808028697968,
+      "learning_rate": 9.374966166409329e-06,
+      "loss": 0.14883997440338134,
+      "step": 3980
+    },
+    {
+      "epoch": 0.7250727802037845,
+      "grad_norm": 0.15091639757156372,
+      "learning_rate": 9.317573770784352e-06,
+      "loss": 0.14726560115814208,
+      "step": 3985
+    },
+    {
+      "epoch": 0.7259825327510917,
+      "grad_norm": 0.17045573890209198,
+      "learning_rate": 9.260317326610051e-06,
+      "loss": 0.14120506048202514,
+      "step": 3990
+    },
+    {
+      "epoch": 0.7268922852983989,
+      "grad_norm": 0.18847957253456116,
+      "learning_rate": 9.203197330244343e-06,
+      "loss": 0.1377041220664978,
+      "step": 3995
+    },
+    {
+      "epoch": 0.727802037845706,
+      "grad_norm": 0.1516445279121399,
+      "learning_rate": 9.14621427686229e-06,
+      "loss": 0.14043946266174318,
+      "step": 4000
+    },
+    {
+      "epoch": 0.7287117903930131,
+      "grad_norm": 0.18264050781726837,
+      "learning_rate": 9.0893686604518e-06,
+      "loss": 0.14080368280410765,
+      "step": 4005
+    },
+    {
+      "epoch": 0.7296215429403202,
+      "grad_norm": 0.19129371643066406,
+      "learning_rate": 9.032660973809312e-06,
+      "loss": 0.1402561902999878,
+      "step": 4010
+    },
+    {
+      "epoch": 0.7305312954876274,
+      "grad_norm": 0.15762710571289062,
+      "learning_rate": 8.976091708535567e-06,
+      "loss": 0.14421157836914061,
+      "step": 4015
+    },
+    {
+      "epoch": 0.7314410480349345,
+      "grad_norm": 0.17785198986530304,
+      "learning_rate": 8.919661355031331e-06,
+      "loss": 0.14999009370803834,
+      "step": 4020
+    },
+    {
+      "epoch": 0.7323508005822417,
+      "grad_norm": 0.15306031703948975,
+      "learning_rate": 8.8633704024931e-06,
+      "loss": 0.14101698398590087,
+      "step": 4025
+    },
+    {
+      "epoch": 0.7332605531295487,
+      "grad_norm": 0.16481758654117584,
+      "learning_rate": 8.807219338908968e-06,
+      "loss": 0.14170764684677123,
+      "step": 4030
+    },
+    {
+      "epoch": 0.7341703056768559,
+      "grad_norm": 0.14892235398292542,
+      "learning_rate": 8.751208651054257e-06,
+      "loss": 0.15317896604537964,
+      "step": 4035
+    },
+    {
+      "epoch": 0.735080058224163,
+      "grad_norm": 0.1775592565536499,
+      "learning_rate": 8.695338824487409e-06,
+      "loss": 0.1520617723464966,
+      "step": 4040
+    },
+    {
+      "epoch": 0.7359898107714702,
+      "grad_norm": 0.1614258885383606,
+      "learning_rate": 8.639610343545728e-06,
+      "loss": 0.13747400045394897,
+      "step": 4045
+    },
+    {
+      "epoch": 0.7368995633187773,
+      "grad_norm": 0.21415506303310394,
+      "learning_rate": 8.58402369134117e-06,
+      "loss": 0.1432439088821411,
+      "step": 4050
+    },
+    {
+      "epoch": 0.7378093158660844,
+      "grad_norm": 0.1759418249130249,
+      "learning_rate": 8.528579349756205e-06,
+      "loss": 0.141641104221344,
+      "step": 4055
+    },
+    {
+      "epoch": 0.7387190684133915,
+      "grad_norm": 0.16738329827785492,
+      "learning_rate": 8.47327779943957e-06,
+      "loss": 0.14294810295104982,
+      "step": 4060
+    },
+    {
+      "epoch": 0.7396288209606987,
+      "grad_norm": 0.13916844129562378,
+      "learning_rate": 8.41811951980217e-06,
+      "loss": 0.13876968622207642,
+      "step": 4065
+    },
+    {
+      "epoch": 0.7405385735080058,
+      "grad_norm": 0.1828441321849823,
+      "learning_rate": 8.36310498901288e-06,
+      "loss": 0.148428475856781,
+      "step": 4070
+    },
+    {
+      "epoch": 0.741448326055313,
+      "grad_norm": 0.16534076631069183,
+      "learning_rate": 8.308234683994415e-06,
+      "loss": 0.14222711324691772,
+      "step": 4075
+    },
+    {
+      "epoch": 0.74235807860262,
+      "grad_norm": 0.17922644317150116,
+      "learning_rate": 8.253509080419198e-06,
+      "loss": 0.14365782737731933,
+      "step": 4080
+    },
+    {
+      "epoch": 0.7432678311499272,
+      "grad_norm": 0.15061035752296448,
+      "learning_rate": 8.198928652705204e-06,
+      "loss": 0.13571925163269044,
+      "step": 4085
+    },
+    {
+      "epoch": 0.7441775836972343,
+      "grad_norm": 0.18075402081012726,
+      "learning_rate": 8.144493874011908e-06,
+      "loss": 0.14385528564453126,
+      "step": 4090
+    },
+    {
+      "epoch": 0.7450873362445415,
+      "grad_norm": 0.16514739394187927,
+      "learning_rate": 8.090205216236135e-06,
+      "loss": 0.14920626878738402,
+      "step": 4095
+    },
+    {
+      "epoch": 0.7459970887918487,
+      "grad_norm": 0.16453702747821808,
+      "learning_rate": 8.03606315000797e-06,
+      "loss": 0.14704222679138185,
+      "step": 4100
+    },
+    {
+      "epoch": 0.7469068413391557,
+      "grad_norm": 0.16719917953014374,
+      "learning_rate": 7.982068144686707e-06,
+      "loss": 0.14722511768341065,
+      "step": 4105
+    },
+    {
+      "epoch": 0.7478165938864629,
+      "grad_norm": 0.18499110639095306,
+      "learning_rate": 7.92822066835677e-06,
+      "loss": 0.1401848554611206,
+      "step": 4110
+    },
+    {
+      "epoch": 0.74872634643377,
+      "grad_norm": 0.17249563336372375,
+      "learning_rate": 7.87452118782363e-06,
+      "loss": 0.15132423639297485,
+      "step": 4115
+    },
+    {
+      "epoch": 0.7496360989810772,
+      "grad_norm": 0.15049682557582855,
+      "learning_rate": 7.8209701686098e-06,
+      "loss": 0.1341150164604187,
+      "step": 4120
+    },
+    {
+      "epoch": 0.7505458515283843,
+      "grad_norm": 0.16892646253108978,
+      "learning_rate": 7.767568074950751e-06,
+      "loss": 0.1466840147972107,
+      "step": 4125
+    },
+    {
+      "epoch": 0.7514556040756915,
+      "grad_norm": 0.17288286983966827,
+      "learning_rate": 7.714315369790942e-06,
+      "loss": 0.13819680213928223,
+      "step": 4130
+    },
+    {
+      "epoch": 0.7523653566229985,
+      "grad_norm": 0.21893996000289917,
+      "learning_rate": 7.661212514779745e-06,
+      "loss": 0.14369510412216185,
+      "step": 4135
+    },
+    {
+      "epoch": 0.7532751091703057,
+      "grad_norm": 0.1674601435661316,
+      "learning_rate": 7.608259970267509e-06,
+      "loss": 0.14810250997543334,
+      "step": 4140
+    },
+    {
+      "epoch": 0.7541848617176128,
+      "grad_norm": 0.15875539183616638,
+      "learning_rate": 7.555458195301526e-06,
+      "loss": 0.14103198051452637,
+      "step": 4145
+    },
+    {
+      "epoch": 0.75509461426492,
+      "grad_norm": 0.19454079866409302,
+      "learning_rate": 7.502807647622037e-06,
+      "loss": 0.13848764896392823,
+      "step": 4150
+    },
+    {
+      "epoch": 0.756004366812227,
+      "grad_norm": 0.1795455813407898,
+      "learning_rate": 7.450308783658341e-06,
+      "loss": 0.14459335803985596,
+      "step": 4155
+    },
+    {
+      "epoch": 0.7569141193595342,
+      "grad_norm": 0.1643362045288086,
+      "learning_rate": 7.397962058524735e-06,
+      "loss": 0.14335378408432006,
+      "step": 4160
+    },
+    {
+      "epoch": 0.7578238719068413,
+      "grad_norm": 0.16362066566944122,
+      "learning_rate": 7.3457679260166475e-06,
+      "loss": 0.14222005605697632,
+      "step": 4165
+    },
+    {
+      "epoch": 0.7587336244541485,
+      "grad_norm": 0.17313003540039062,
+      "learning_rate": 7.293726838606674e-06,
+      "loss": 0.14272255897521974,
+      "step": 4170
+    },
+    {
+      "epoch": 0.7596433770014556,
+      "grad_norm": 0.1809929460287094,
+      "learning_rate": 7.2418392474406405e-06,
+      "loss": 0.14089123010635377,
+      "step": 4175
+    },
+    {
+      "epoch": 0.7605531295487628,
+      "grad_norm": 0.14306005835533142,
+      "learning_rate": 7.19010560233373e-06,
+      "loss": 0.13531534671783446,
+      "step": 4180
+    },
+    {
+      "epoch": 0.7614628820960698,
+      "grad_norm": 0.15525390207767487,
+      "learning_rate": 7.138526351766559e-06,
+      "loss": 0.14340845346450806,
+      "step": 4185
+    },
+    {
+      "epoch": 0.762372634643377,
+      "grad_norm": 0.24478943645954132,
+      "learning_rate": 7.087101942881263e-06,
+      "loss": 0.14744555950164795,
+      "step": 4190
+    },
+    {
+      "epoch": 0.7632823871906841,
+      "grad_norm": 0.31335577368736267,
+      "learning_rate": 7.035832821477711e-06,
+      "loss": 0.1484094500541687,
+      "step": 4195
+    },
+    {
+      "epoch": 0.7641921397379913,
+      "grad_norm": 0.15140366554260254,
+      "learning_rate": 6.984719432009515e-06,
+      "loss": 0.14991614818572999,
+      "step": 4200
+    },
+    {
+      "epoch": 0.7651018922852983,
+      "grad_norm": 0.16125506162643433,
+      "learning_rate": 6.933762217580289e-06,
+      "loss": 0.1408134937286377,
+      "step": 4205
+    },
+    {
+      "epoch": 0.7660116448326055,
+      "grad_norm": 0.2501450181007385,
+      "learning_rate": 6.882961619939726e-06,
+      "loss": 0.13875640630722047,
+      "step": 4210
+    },
+    {
+      "epoch": 0.7669213973799127,
+      "grad_norm": 0.16227811574935913,
+      "learning_rate": 6.8323180794798245e-06,
+      "loss": 0.14138660430908204,
+      "step": 4215
+    },
+    {
+      "epoch": 0.7678311499272198,
+      "grad_norm": 0.16676810383796692,
+      "learning_rate": 6.781832035231053e-06,
+      "loss": 0.14696706533432008,
+      "step": 4220
+    },
+    {
+      "epoch": 0.768740902474527,
+      "grad_norm": 0.14638574421405792,
+      "learning_rate": 6.731503924858518e-06,
+      "loss": 0.14263020753860473,
+      "step": 4225
+    },
+    {
+      "epoch": 0.769650655021834,
+      "grad_norm": 0.17093190550804138,
+      "learning_rate": 6.681334184658211e-06,
+      "loss": 0.14694111347198485,
+      "step": 4230
+    },
+    {
+      "epoch": 0.7705604075691412,
+      "grad_norm": 0.17174287140369415,
+      "learning_rate": 6.631323249553201e-06,
+      "loss": 0.13854929208755493,
+      "step": 4235
+    },
+    {
+      "epoch": 0.7714701601164483,
+      "grad_norm": 0.14599016308784485,
+      "learning_rate": 6.5814715530898745e-06,
+      "loss": 0.14058833122253417,
+      "step": 4240
+    },
+    {
+      "epoch": 0.7723799126637555,
+      "grad_norm": 0.16222265362739563,
+      "learning_rate": 6.531779527434176e-06,
+      "loss": 0.1428326725959778,
+      "step": 4245
+    },
+    {
+      "epoch": 0.7732896652110626,
+      "grad_norm": 0.1741994023323059,
+      "learning_rate": 6.482247603367839e-06,
+      "loss": 0.13985042572021483,
+      "step": 4250
+    },
+    {
+      "epoch": 0.7741994177583698,
+      "grad_norm": 0.17427101731300354,
+      "learning_rate": 6.432876210284688e-06,
+      "loss": 0.1442667603492737,
+      "step": 4255
+    },
+    {
+      "epoch": 0.7751091703056768,
+      "grad_norm": 0.1665259599685669,
+      "learning_rate": 6.383665776186912e-06,
+      "loss": 0.1421986222267151,
+      "step": 4260
+    },
+    {
+      "epoch": 0.776018922852984,
+      "grad_norm": 0.1728232353925705,
+      "learning_rate": 6.334616727681303e-06,
+      "loss": 0.1367053508758545,
+      "step": 4265
+    },
+    {
+      "epoch": 0.7769286754002911,
+      "grad_norm": 0.15882381796836853,
+      "learning_rate": 6.285729489975639e-06,
+      "loss": 0.14551182985305786,
+      "step": 4270
+    },
+    {
+      "epoch": 0.7778384279475983,
+      "grad_norm": 0.242042675614357,
+      "learning_rate": 6.2370044868749115e-06,
+      "loss": 0.1455132007598877,
+      "step": 4275
+    },
+    {
+      "epoch": 0.7787481804949054,
+      "grad_norm": 0.1599501073360443,
+      "learning_rate": 6.188442140777742e-06,
+      "loss": 0.1424942970275879,
+      "step": 4280
+    },
+    {
+      "epoch": 0.7796579330422125,
+      "grad_norm": 0.15182635188102722,
+      "learning_rate": 6.140042872672647e-06,
+      "loss": 0.14212887287139891,
+      "step": 4285
+    },
+    {
+      "epoch": 0.7805676855895196,
+      "grad_norm": 0.1720375418663025,
+      "learning_rate": 6.091807102134403e-06,
+      "loss": 0.14243412017822266,
+      "step": 4290
+    },
+    {
+      "epoch": 0.7814774381368268,
+      "grad_norm": 0.16436047852039337,
+      "learning_rate": 6.043735247320454e-06,
+      "loss": 0.15035657882690429,
+      "step": 4295
+    },
+    {
+      "epoch": 0.7823871906841339,
+      "grad_norm": 0.1498408019542694,
+      "learning_rate": 5.995827724967218e-06,
+      "loss": 0.14494839906692505,
+      "step": 4300
+    },
+    {
+      "epoch": 0.7832969432314411,
+      "grad_norm": 0.16924560070037842,
+      "learning_rate": 5.948084950386535e-06,
+      "loss": 0.13581212759017944,
+      "step": 4305
+    },
+    {
+      "epoch": 0.7842066957787481,
+      "grad_norm": 0.15889139473438263,
+      "learning_rate": 5.900507337462036e-06,
+      "loss": 0.15071530342102052,
+      "step": 4310
+    },
+    {
+      "epoch": 0.7851164483260553,
+      "grad_norm": 0.17201054096221924,
+      "learning_rate": 5.853095298645542e-06,
+      "loss": 0.1398628830909729,
+      "step": 4315
+    },
+    {
+      "epoch": 0.7860262008733624,
+      "grad_norm": 0.17965619266033173,
+      "learning_rate": 5.805849244953548e-06,
+      "loss": 0.14666696786880493,
+      "step": 4320
+    },
+    {
+      "epoch": 0.7869359534206696,
+      "grad_norm": 0.17514032125473022,
+      "learning_rate": 5.758769585963569e-06,
+      "loss": 0.1383386731147766,
+      "step": 4325
+    },
+    {
+      "epoch": 0.7878457059679768,
+      "grad_norm": 0.17497631907463074,
+      "learning_rate": 5.7118567298106744e-06,
+      "loss": 0.14362354278564454,
+      "step": 4330
+    },
+    {
+      "epoch": 0.7887554585152838,
+      "grad_norm": 0.16770458221435547,
+      "learning_rate": 5.665111083183905e-06,
+      "loss": 0.14136618375778198,
+      "step": 4335
+    },
+    {
+      "epoch": 0.789665211062591,
+      "grad_norm": 0.17134106159210205,
+      "learning_rate": 5.618533051322747e-06,
+      "loss": 0.1401529550552368,
+      "step": 4340
+    },
+    {
+      "epoch": 0.7905749636098981,
+      "grad_norm": 0.19458788633346558,
+      "learning_rate": 5.5721230380136435e-06,
+      "loss": 0.1393273115158081,
+      "step": 4345
+    },
+    {
+      "epoch": 0.7914847161572053,
+      "grad_norm": 0.19483692944049835,
+      "learning_rate": 5.525881445586467e-06,
+      "loss": 0.1369825482368469,
+      "step": 4350
+    },
+    {
+      "epoch": 0.7923944687045124,
+      "grad_norm": 0.3052191734313965,
+      "learning_rate": 5.4798086749110495e-06,
+      "loss": 0.14762181043624878,
+      "step": 4355
+    },
+    {
+      "epoch": 0.7933042212518195,
+      "grad_norm": 0.164458766579628,
+      "learning_rate": 5.4339051253937065e-06,
+      "loss": 0.14501686096191407,
+      "step": 4360
+    },
+    {
+      "epoch": 0.7942139737991266,
+      "grad_norm": 0.1719193458557129,
+      "learning_rate": 5.3881711949737625e-06,
+      "loss": 0.13321092128753662,
+      "step": 4365
+    },
+    {
+      "epoch": 0.7951237263464338,
+      "grad_norm": 0.17219696938991547,
+      "learning_rate": 5.342607280120121e-06,
+      "loss": 0.1413906455039978,
+      "step": 4370
+    },
+    {
+      "epoch": 0.7960334788937409,
+      "grad_norm": 0.15083056688308716,
+      "learning_rate": 5.297213775827789e-06,
+      "loss": 0.14772192239761353,
+      "step": 4375
+    },
+    {
+      "epoch": 0.7969432314410481,
+      "grad_norm": 0.1699071079492569,
+      "learning_rate": 5.251991075614507e-06,
+      "loss": 0.1392375946044922,
+      "step": 4380
+    },
+    {
+      "epoch": 0.7978529839883551,
+      "grad_norm": 0.1680395007133484,
+      "learning_rate": 5.206939571517302e-06,
+      "loss": 0.14185575246810914,
+      "step": 4385
+    },
+    {
+      "epoch": 0.7987627365356623,
+      "grad_norm": 0.16526710987091064,
+      "learning_rate": 5.162059654089083e-06,
+      "loss": 0.15001428127288818,
+      "step": 4390
+    },
+    {
+      "epoch": 0.7996724890829694,
+      "grad_norm": 0.16281752288341522,
+      "learning_rate": 5.1173517123952794e-06,
+      "loss": 0.13747023344039916,
+      "step": 4395
+    },
+    {
+      "epoch": 0.8005822416302766,
+      "grad_norm": 0.1454378366470337,
+      "learning_rate": 5.072816134010458e-06,
+      "loss": 0.14710829257965088,
+      "step": 4400
+    },
+    {
+      "epoch": 0.8014919941775837,
+      "grad_norm": 0.16565890610218048,
+      "learning_rate": 5.028453305014966e-06,
+      "loss": 0.14138611555099487,
+      "step": 4405
+    },
+    {
+      "epoch": 0.8024017467248908,
+      "grad_norm": 0.1962810605764389,
+      "learning_rate": 4.984263609991577e-06,
+      "loss": 0.13836177587509155,
+      "step": 4410
+    },
+    {
+      "epoch": 0.8033114992721979,
+      "grad_norm": 0.16091369092464447,
+      "learning_rate": 4.940247432022149e-06,
+      "loss": 0.14407440423965454,
+      "step": 4415
+    },
+    {
+      "epoch": 0.8042212518195051,
+      "grad_norm": 0.1930241584777832,
+      "learning_rate": 4.89640515268433e-06,
+      "loss": 0.14346336126327514,
+      "step": 4420
+    },
+    {
+      "epoch": 0.8051310043668122,
+      "grad_norm": 0.19301500916481018,
+      "learning_rate": 4.852737152048242e-06,
+      "loss": 0.14174317121505736,
+      "step": 4425
+    },
+    {
+      "epoch": 0.8060407569141194,
+      "grad_norm": 0.1541353315114975,
+      "learning_rate": 4.80924380867315e-06,
+      "loss": 0.14100592136383056,
+      "step": 4430
+    },
+    {
+      "epoch": 0.8069505094614265,
+      "grad_norm": 0.16285750269889832,
+      "learning_rate": 4.765925499604243e-06,
+      "loss": 0.1441288709640503,
+      "step": 4435
+    },
+    {
+      "epoch": 0.8078602620087336,
+      "grad_norm": 0.17382675409317017,
+      "learning_rate": 4.722782600369299e-06,
+      "loss": 0.13763951063156127,
+      "step": 4440
+    },
+    {
+      "epoch": 0.8087700145560408,
+      "grad_norm": 0.1697344034910202,
+      "learning_rate": 4.679815484975505e-06,
+      "loss": 0.1410105347633362,
+      "step": 4445
+    },
+    {
+      "epoch": 0.8096797671033479,
+      "grad_norm": 0.19964542984962463,
+      "learning_rate": 4.637024525906131e-06,
+      "loss": 0.1439276695251465,
+      "step": 4450
+    },
+    {
+      "epoch": 0.8105895196506551,
+      "grad_norm": 0.165307879447937,
+      "learning_rate": 4.59441009411736e-06,
+      "loss": 0.13897504806518554,
+      "step": 4455
+    },
+    {
+      "epoch": 0.8114992721979621,
+      "grad_norm": 0.16687989234924316,
+      "learning_rate": 4.551972559035067e-06,
+      "loss": 0.1422593355178833,
+      "step": 4460
+    },
+    {
+      "epoch": 0.8124090247452693,
+      "grad_norm": 0.15737789869308472,
+      "learning_rate": 4.509712288551571e-06,
+      "loss": 0.1452128052711487,
+      "step": 4465
+    },
+    {
+      "epoch": 0.8133187772925764,
+      "grad_norm": 0.17116659879684448,
+      "learning_rate": 4.467629649022509e-06,
+      "loss": 0.14385371208190917,
+      "step": 4470
+    },
+    {
+      "epoch": 0.8142285298398836,
+      "grad_norm": 0.17457640171051025,
+      "learning_rate": 4.425725005263623e-06,
+      "loss": 0.14808475971221924,
+      "step": 4475
+    },
+    {
+      "epoch": 0.8151382823871907,
+      "grad_norm": 0.1621970385313034,
+      "learning_rate": 4.383998720547583e-06,
+      "loss": 0.13927959203720092,
+      "step": 4480
+    },
+    {
+      "epoch": 0.8160480349344978,
+      "grad_norm": 0.176296666264534,
+      "learning_rate": 4.342451156600896e-06,
+      "loss": 0.15041060447692872,
+      "step": 4485
+    },
+    {
+      "epoch": 0.8169577874818049,
+      "grad_norm": 0.17157645523548126,
+      "learning_rate": 4.301082673600698e-06,
+      "loss": 0.13932652473449708,
+      "step": 4490
+    },
+    {
+      "epoch": 0.8178675400291121,
+      "grad_norm": 0.15378527343273163,
+      "learning_rate": 4.259893630171682e-06,
+      "loss": 0.1406856894493103,
+      "step": 4495
+    },
+    {
+      "epoch": 0.8187772925764192,
+      "grad_norm": 0.1750226765871048,
+      "learning_rate": 4.218884383382987e-06,
+      "loss": 0.1350164532661438,
+      "step": 4500
+    },
+    {
+      "epoch": 0.8196870451237264,
+      "grad_norm": 0.1393742561340332,
+      "learning_rate": 4.178055288745053e-06,
+      "loss": 0.13769235610961914,
+      "step": 4505
+    },
+    {
+      "epoch": 0.8205967976710334,
+      "grad_norm": 0.1668994128704071,
+      "learning_rate": 4.137406700206617e-06,
+      "loss": 0.14029752016067504,
+      "step": 4510
+    },
+    {
+      "epoch": 0.8215065502183406,
+      "grad_norm": 0.1833454668521881,
+      "learning_rate": 4.0969389701515675e-06,
+      "loss": 0.14276301860809326,
+      "step": 4515
+    },
+    {
+      "epoch": 0.8224163027656477,
+      "grad_norm": 0.16187874972820282,
+      "learning_rate": 4.056652449395945e-06,
+      "loss": 0.1444832682609558,
+      "step": 4520
+    },
+    {
+      "epoch": 0.8233260553129549,
+      "grad_norm": 0.1453280746936798,
+      "learning_rate": 4.01654748718488e-06,
+      "loss": 0.14512733221054078,
+      "step": 4525
+    },
+    {
+      "epoch": 0.824235807860262,
+      "grad_norm": 0.1782725751399994,
+      "learning_rate": 3.976624431189563e-06,
+      "loss": 0.14093561172485353,
+      "step": 4530
+    },
+    {
+      "epoch": 0.8251455604075691,
+      "grad_norm": 0.17374491691589355,
+      "learning_rate": 3.936883627504234e-06,
+      "loss": 0.14031401872634888,
+      "step": 4535
+    },
+    {
+      "epoch": 0.8260553129548762,
+      "grad_norm": 0.1609172821044922,
+      "learning_rate": 3.897325420643174e-06,
+      "loss": 0.1428336262702942,
+      "step": 4540
+    },
+    {
+      "epoch": 0.8269650655021834,
+      "grad_norm": 0.1520884931087494,
+      "learning_rate": 3.85795015353774e-06,
+      "loss": 0.1460547924041748,
+      "step": 4545
+    },
+    {
+      "epoch": 0.8278748180494906,
+      "grad_norm": 0.20986326038837433,
+      "learning_rate": 3.818758167533376e-06,
+      "loss": 0.14706350564956666,
+      "step": 4550
+    },
+    {
+      "epoch": 0.8287845705967977,
+      "grad_norm": 0.16825413703918457,
+      "learning_rate": 3.7797498023866396e-06,
+      "loss": 0.14507200717926025,
+      "step": 4555
+    },
+    {
+      "epoch": 0.8296943231441049,
+      "grad_norm": 0.16758380830287933,
+      "learning_rate": 3.740925396262296e-06,
+      "loss": 0.14898381233215333,
+      "step": 4560
+    },
+    {
+      "epoch": 0.8306040756914119,
+      "grad_norm": 0.15207453072071075,
+      "learning_rate": 3.7022852857303503e-06,
+      "loss": 0.14138854742050172,
+      "step": 4565
+    },
+    {
+      "epoch": 0.8315138282387191,
+      "grad_norm": 0.15150749683380127,
+      "learning_rate": 3.66382980576315e-06,
+      "loss": 0.13894975185394287,
+      "step": 4570
+    },
+    {
+      "epoch": 0.8324235807860262,
+      "grad_norm": 0.17071188986301422,
+      "learning_rate": 3.625559289732472e-06,
+      "loss": 0.14072470664978026,
+      "step": 4575
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.154335618019104,
+      "learning_rate": 3.5874740694066294e-06,
+      "loss": 0.13791344165802003,
+      "step": 4580
+    },
+    {
+      "epoch": 0.8342430858806404,
+      "grad_norm": 0.14017128944396973,
+      "learning_rate": 3.5495744749476116e-06,
+      "loss": 0.14427922964096068,
+      "step": 4585
+    },
+    {
+      "epoch": 0.8351528384279476,
+      "grad_norm": 0.17210033535957336,
+      "learning_rate": 3.5118608349081983e-06,
+      "loss": 0.15191166400909423,
+      "step": 4590
+    },
+    {
+      "epoch": 0.8360625909752547,
+      "grad_norm": 0.18715685606002808,
+      "learning_rate": 3.4743334762291358e-06,
+      "loss": 0.14451316595077515,
+      "step": 4595
+    },
+    {
+      "epoch": 0.8369723435225619,
+      "grad_norm": 0.18079884350299835,
+      "learning_rate": 3.436992724236293e-06,
+      "loss": 0.13530746698379517,
+      "step": 4600
+    },
+    {
+      "epoch": 0.837882096069869,
+      "grad_norm": 0.13519920408725739,
+      "learning_rate": 3.399838902637817e-06,
+      "loss": 0.1477964401245117,
+      "step": 4605
+    },
+    {
+      "epoch": 0.8387918486171762,
+      "grad_norm": 0.1778026670217514,
+      "learning_rate": 3.3628723335213885e-06,
+      "loss": 0.14419831037521363,
+      "step": 4610
+    },
+    {
+      "epoch": 0.8397016011644832,
+      "grad_norm": 0.15165366232395172,
+      "learning_rate": 3.326093337351355e-06,
+      "loss": 0.13888469934463502,
+      "step": 4615
+    },
+    {
+      "epoch": 0.8406113537117904,
+      "grad_norm": 0.17049473524093628,
+      "learning_rate": 3.2895022329660018e-06,
+      "loss": 0.14438477754592896,
+      "step": 4620
+    },
+    {
+      "epoch": 0.8415211062590975,
+      "grad_norm": 0.16536414623260498,
+      "learning_rate": 3.2530993375747833e-06,
+      "loss": 0.1444351315498352,
+      "step": 4625
+    },
+    {
+      "epoch": 0.8424308588064047,
+      "grad_norm": 0.17570015788078308,
+      "learning_rate": 3.2168849667555402e-06,
+      "loss": 0.13861945867538453,
+      "step": 4630
+    },
+    {
+      "epoch": 0.8433406113537117,
+      "grad_norm": 0.1699545532464981,
+      "learning_rate": 3.1808594344518132e-06,
+      "loss": 0.13902754783630372,
+      "step": 4635
+    },
+    {
+      "epoch": 0.8442503639010189,
+      "grad_norm": 0.12331254780292511,
+      "learning_rate": 3.1450230529700837e-06,
+      "loss": 0.14104254245758058,
+      "step": 4640
+    },
+    {
+      "epoch": 0.845160116448326,
+      "grad_norm": 0.1508190929889679,
+      "learning_rate": 3.1093761329770708e-06,
+      "loss": 0.13288766145706177,
+      "step": 4645
+    },
+    {
+      "epoch": 0.8460698689956332,
+      "grad_norm": 0.19049489498138428,
+      "learning_rate": 3.0739189834970735e-06,
+      "loss": 0.14914840459823608,
+      "step": 4650
+    },
+    {
+      "epoch": 0.8469796215429404,
+      "grad_norm": 0.1662369966506958,
+      "learning_rate": 3.0386519119092293e-06,
+      "loss": 0.14222898483276367,
+      "step": 4655
+    },
+    {
+      "epoch": 0.8478893740902474,
+      "grad_norm": 0.18985967338085175,
+      "learning_rate": 3.0035752239449126e-06,
+      "loss": 0.14431113004684448,
+      "step": 4660
+    },
+    {
+      "epoch": 0.8487991266375546,
+      "grad_norm": 0.17005261778831482,
+      "learning_rate": 2.9686892236850337e-06,
+      "loss": 0.14140807390213012,
+      "step": 4665
+    },
+    {
+      "epoch": 0.8497088791848617,
+      "grad_norm": 0.16786684095859528,
+      "learning_rate": 2.9339942135574394e-06,
+      "loss": 0.14161460399627684,
+      "step": 4670
+    },
+    {
+      "epoch": 0.8506186317321689,
+      "grad_norm": 0.16358181834220886,
+      "learning_rate": 2.899490494334281e-06,
+      "loss": 0.14674670696258546,
+      "step": 4675
+    },
+    {
+      "epoch": 0.851528384279476,
+      "grad_norm": 0.1651349812746048,
+      "learning_rate": 2.8651783651293867e-06,
+      "loss": 0.13794611692428588,
+      "step": 4680
+    },
+    {
+      "epoch": 0.8524381368267832,
+      "grad_norm": 0.16934923827648163,
+      "learning_rate": 2.831058123395694e-06,
+      "loss": 0.13199397325515747,
+      "step": 4685
+    },
+    {
+      "epoch": 0.8533478893740902,
+      "grad_norm": 0.1704150140285492,
+      "learning_rate": 2.797130064922665e-06,
+      "loss": 0.14044904708862305,
+      "step": 4690
+    },
+    {
+      "epoch": 0.8542576419213974,
+      "grad_norm": 0.1814192682504654,
+      "learning_rate": 2.7633944838337143e-06,
+      "loss": 0.1465100646018982,
+      "step": 4695
+    },
+    {
+      "epoch": 0.8551673944687045,
+      "grad_norm": 0.18942610919475555,
+      "learning_rate": 2.729851672583669e-06,
+      "loss": 0.14685982465744019,
+      "step": 4700
+    },
+    {
+      "epoch": 0.8560771470160117,
+      "grad_norm": 0.17895208299160004,
+      "learning_rate": 2.6965019219562155e-06,
+      "loss": 0.13971571922302245,
+      "step": 4705
+    },
+    {
+      "epoch": 0.8569868995633187,
+      "grad_norm": 0.22735828161239624,
+      "learning_rate": 2.6633455210614055e-06,
+      "loss": 0.13776102066040039,
+      "step": 4710
+    },
+    {
+      "epoch": 0.8578966521106259,
+      "grad_norm": 0.16779793798923492,
+      "learning_rate": 2.630382757333133e-06,
+      "loss": 0.14134042263031005,
+      "step": 4715
+    },
+    {
+      "epoch": 0.858806404657933,
+      "grad_norm": 0.2148888260126114,
+      "learning_rate": 2.597613916526637e-06,
+      "loss": 0.14680721759796142,
+      "step": 4720
+    },
+    {
+      "epoch": 0.8597161572052402,
+      "grad_norm": 0.16560257971286774,
+      "learning_rate": 2.565039282716045e-06,
+      "loss": 0.14137234687805175,
+      "step": 4725
+    },
+    {
+      "epoch": 0.8606259097525473,
+      "grad_norm": 0.16197068989276886,
+      "learning_rate": 2.532659138291879e-06,
+      "loss": 0.14969314336776735,
+      "step": 4730
+    },
+    {
+      "epoch": 0.8615356622998545,
+      "grad_norm": 0.14650246500968933,
+      "learning_rate": 2.5004737639586497e-06,
+      "loss": 0.13532910346984864,
+      "step": 4735
+    },
+    {
+      "epoch": 0.8624454148471615,
+      "grad_norm": 0.1565634310245514,
+      "learning_rate": 2.4684834387323943e-06,
+      "loss": 0.14146244525909424,
+      "step": 4740
+    },
+    {
+      "epoch": 0.8633551673944687,
+      "grad_norm": 0.18060864508152008,
+      "learning_rate": 2.4366884399382393e-06,
+      "loss": 0.14218534231185914,
+      "step": 4745
+    },
+    {
+      "epoch": 0.8642649199417758,
+      "grad_norm": 0.24613255262374878,
+      "learning_rate": 2.4050890432080557e-06,
+      "loss": 0.13907679319381713,
+      "step": 4750
+    },
+    {
+      "epoch": 0.865174672489083,
+      "grad_norm": 0.16036023199558258,
+      "learning_rate": 2.3736855224780057e-06,
+      "loss": 0.13718113899230958,
+      "step": 4755
+    },
+    {
+      "epoch": 0.86608442503639,
+      "grad_norm": 0.16678516566753387,
+      "learning_rate": 2.3424781499862075e-06,
+      "loss": 0.1327962040901184,
+      "step": 4760
+    },
+    {
+      "epoch": 0.8669941775836972,
+      "grad_norm": 0.1763770878314972,
+      "learning_rate": 2.3114671962703727e-06,
+      "loss": 0.14390318393707274,
+      "step": 4765
+    },
+    {
+      "epoch": 0.8679039301310044,
+      "grad_norm": 0.17735697329044342,
+      "learning_rate": 2.280652930165428e-06,
+      "loss": 0.15223288536071777,
+      "step": 4770
+    },
+    {
+      "epoch": 0.8688136826783115,
+      "grad_norm": 0.15827041864395142,
+      "learning_rate": 2.250035618801241e-06,
+      "loss": 0.14296332597732545,
+      "step": 4775
+    },
+    {
+      "epoch": 0.8697234352256187,
+      "grad_norm": 0.16876135766506195,
+      "learning_rate": 2.219615527600244e-06,
+      "loss": 0.1359076738357544,
+      "step": 4780
+    },
+    {
+      "epoch": 0.8706331877729258,
+      "grad_norm": 0.1800110638141632,
+      "learning_rate": 2.189392920275174e-06,
+      "loss": 0.1424281358718872,
+      "step": 4785
+    },
+    {
+      "epoch": 0.8715429403202329,
+      "grad_norm": 0.1409560889005661,
+      "learning_rate": 2.159368058826783e-06,
+      "loss": 0.14480490684509278,
+      "step": 4790
+    },
+    {
+      "epoch": 0.87245269286754,
+      "grad_norm": 0.1634288728237152,
+      "learning_rate": 2.129541203541535e-06,
+      "loss": 0.14513269662857056,
+      "step": 4795
+    },
+    {
+      "epoch": 0.8733624454148472,
+      "grad_norm": 0.17126062512397766,
+      "learning_rate": 2.099912612989391e-06,
+      "loss": 0.13546934127807617,
+      "step": 4800
+    },
+    {
+      "epoch": 0.8742721979621543,
+      "grad_norm": 0.16704080998897552,
+      "learning_rate": 2.0704825440215457e-06,
+      "loss": 0.13852492570877076,
+      "step": 4805
+    },
+    {
+      "epoch": 0.8751819505094615,
+      "grad_norm": 0.1725970208644867,
+      "learning_rate": 2.0412512517681946e-06,
+      "loss": 0.14504197835922242,
+      "step": 4810
+    },
+    {
+      "epoch": 0.8760917030567685,
+      "grad_norm": 0.1700201779603958,
+      "learning_rate": 2.0122189896363387e-06,
+      "loss": 0.14312338829040527,
+      "step": 4815
+    },
+    {
+      "epoch": 0.8770014556040757,
+      "grad_norm": 0.16491736471652985,
+      "learning_rate": 1.9833860093075834e-06,
+      "loss": 0.14062976837158203,
+      "step": 4820
+    },
+    {
+      "epoch": 0.8779112081513828,
+      "grad_norm": 0.13748787343502045,
+      "learning_rate": 1.9547525607359537e-06,
+      "loss": 0.1346171498298645,
+      "step": 4825
+    },
+    {
+      "epoch": 0.87882096069869,
+      "grad_norm": 0.16399399936199188,
+      "learning_rate": 1.926318892145712e-06,
+      "loss": 0.14178123474121093,
+      "step": 4830
+    },
+    {
+      "epoch": 0.879730713245997,
+      "grad_norm": 0.14491963386535645,
+      "learning_rate": 1.8980852500292412e-06,
+      "loss": 0.1408564567565918,
+      "step": 4835
+    },
+    {
+      "epoch": 0.8806404657933042,
+      "grad_norm": 0.17335423827171326,
+      "learning_rate": 1.8700518791448851e-06,
+      "loss": 0.14403265714645386,
+      "step": 4840
+    },
+    {
+      "epoch": 0.8815502183406113,
+      "grad_norm": 0.17399625480175018,
+      "learning_rate": 1.8422190225148155e-06,
+      "loss": 0.14289036989212037,
+      "step": 4845
+    },
+    {
+      "epoch": 0.8824599708879185,
+      "grad_norm": 0.17945612967014313,
+      "learning_rate": 1.814586921422956e-06,
+      "loss": 0.14494109153747559,
+      "step": 4850
+    },
+    {
+      "epoch": 0.8833697234352256,
+      "grad_norm": 0.1910620480775833,
+      "learning_rate": 1.7871558154128664e-06,
+      "loss": 0.13726245164871215,
+      "step": 4855
+    },
+    {
+      "epoch": 0.8842794759825328,
+      "grad_norm": 0.1771879345178604,
+      "learning_rate": 1.7599259422856756e-06,
+      "loss": 0.1464752197265625,
+      "step": 4860
+    },
+    {
+      "epoch": 0.8851892285298398,
+      "grad_norm": 0.19427461922168732,
+      "learning_rate": 1.7328975380980218e-06,
+      "loss": 0.13823356628417968,
+      "step": 4865
+    },
+    {
+      "epoch": 0.886098981077147,
+      "grad_norm": 0.1491149365901947,
+      "learning_rate": 1.7060708371599897e-06,
+      "loss": 0.1338604211807251,
+      "step": 4870
+    },
+    {
+      "epoch": 0.8870087336244541,
+      "grad_norm": 0.16087733209133148,
+      "learning_rate": 1.6794460720331057e-06,
+      "loss": 0.14184389114379883,
+      "step": 4875
+    },
+    {
+      "epoch": 0.8879184861717613,
+      "grad_norm": 0.14506325125694275,
+      "learning_rate": 1.653023473528309e-06,
+      "loss": 0.14267687797546386,
+      "step": 4880
+    },
+    {
+      "epoch": 0.8888282387190685,
+      "grad_norm": 0.16886365413665771,
+      "learning_rate": 1.626803270703936e-06,
+      "loss": 0.14266083240509034,
+      "step": 4885
+    },
+    {
+      "epoch": 0.8897379912663755,
+      "grad_norm": 0.1891999989748001,
+      "learning_rate": 1.6007856908637652e-06,
+      "loss": 0.1398016929626465,
+      "step": 4890
+    },
+    {
+      "epoch": 0.8906477438136827,
+      "grad_norm": 0.17645299434661865,
+      "learning_rate": 1.5749709595550083e-06,
+      "loss": 0.13869571685791016,
+      "step": 4895
+    },
+    {
+      "epoch": 0.8915574963609898,
+      "grad_norm": 0.17714262008666992,
+      "learning_rate": 1.549359300566408e-06,
+      "loss": 0.14957486391067504,
+      "step": 4900
+    },
+    {
+      "epoch": 0.892467248908297,
+      "grad_norm": 0.18025240302085876,
+      "learning_rate": 1.5239509359262355e-06,
+      "loss": 0.1358652949333191,
+      "step": 4905
+    },
+    {
+      "epoch": 0.8933770014556041,
+      "grad_norm": 0.17539937794208527,
+      "learning_rate": 1.4987460859004154e-06,
+      "loss": 0.13833394050598144,
+      "step": 4910
+    },
+    {
+      "epoch": 0.8942867540029112,
+      "grad_norm": 0.1772230565547943,
+      "learning_rate": 1.4737449689905953e-06,
+      "loss": 0.14202116727828978,
+      "step": 4915
+    },
+    {
+      "epoch": 0.8951965065502183,
+      "grad_norm": 0.1670161783695221,
+      "learning_rate": 1.4489478019322433e-06,
+      "loss": 0.1403665542602539,
+      "step": 4920
+    },
+    {
+      "epoch": 0.8961062590975255,
+      "grad_norm": 0.1697034239768982,
+      "learning_rate": 1.4243547996927926e-06,
+      "loss": 0.1401481032371521,
+      "step": 4925
+    },
+    {
+      "epoch": 0.8970160116448326,
+      "grad_norm": 0.16474860906600952,
+      "learning_rate": 1.3999661754697636e-06,
+      "loss": 0.13969850540161133,
+      "step": 4930
+    },
+    {
+      "epoch": 0.8979257641921398,
+      "grad_norm": 0.1664883941411972,
+      "learning_rate": 1.3757821406889027e-06,
+      "loss": 0.1399069309234619,
+      "step": 4935
+    },
+    {
+      "epoch": 0.8988355167394468,
+      "grad_norm": 0.16675794124603271,
+      "learning_rate": 1.351802905002386e-06,
+      "loss": 0.14129226207733153,
+      "step": 4940
+    },
+    {
+      "epoch": 0.899745269286754,
+      "grad_norm": 0.17529809474945068,
+      "learning_rate": 1.3280286762869632e-06,
+      "loss": 0.14663081169128417,
+      "step": 4945
+    },
+    {
+      "epoch": 0.9006550218340611,
+      "grad_norm": 0.17758169770240784,
+      "learning_rate": 1.3044596606421795e-06,
+      "loss": 0.13986254930496217,
+      "step": 4950
+    },
+    {
+      "epoch": 0.9015647743813683,
+      "grad_norm": 0.153225839138031,
+      "learning_rate": 1.2810960623885815e-06,
+      "loss": 0.14236698150634766,
+      "step": 4955
+    },
+    {
+      "epoch": 0.9024745269286754,
+      "grad_norm": 0.169761523604393,
+      "learning_rate": 1.2579380840659376e-06,
+      "loss": 0.1450445055961609,
+      "step": 4960
+    },
+    {
+      "epoch": 0.9033842794759825,
+      "grad_norm": 0.16659331321716309,
+      "learning_rate": 1.2349859264315034e-06,
+      "loss": 0.14043926000595092,
+      "step": 4965
+    },
+    {
+      "epoch": 0.9042940320232896,
+      "grad_norm": 0.16748706996440887,
+      "learning_rate": 1.2122397884582553e-06,
+      "loss": 0.14725675582885742,
+      "step": 4970
+    },
+    {
+      "epoch": 0.9052037845705968,
+      "grad_norm": 0.1600511223077774,
+      "learning_rate": 1.1896998673331883e-06,
+      "loss": 0.14551150798797607,
+      "step": 4975
+    },
+    {
+      "epoch": 0.9061135371179039,
+      "grad_norm": 0.24318362772464752,
+      "learning_rate": 1.1673663584555934e-06,
+      "loss": 0.14470888376235963,
+      "step": 4980
+    },
+    {
+      "epoch": 0.9070232896652111,
+      "grad_norm": 0.16443821787834167,
+      "learning_rate": 1.1452394554353706e-06,
+      "loss": 0.13639854192733764,
+      "step": 4985
+    },
+    {
+      "epoch": 0.9079330422125182,
+      "grad_norm": 0.14277774095535278,
+      "learning_rate": 1.1233193500913453e-06,
+      "loss": 0.13749881982803344,
+      "step": 4990
+    },
+    {
+      "epoch": 0.9088427947598253,
+      "grad_norm": 0.1610947549343109,
+      "learning_rate": 1.1016062324496008e-06,
+      "loss": 0.1385629653930664,
+      "step": 4995
+    },
+    {
+      "epoch": 0.9097525473071325,
+      "grad_norm": 0.17888498306274414,
+      "learning_rate": 1.080100290741845e-06,
+      "loss": 0.14225621223449708,
+      "step": 5000
+    },
+    {
+      "epoch": 0.9106622998544396,
+      "grad_norm": 0.17488449811935425,
+      "learning_rate": 1.0588017114037729e-06,
+      "loss": 0.14187805652618407,
+      "step": 5005
+    },
+    {
+      "epoch": 0.9115720524017468,
+      "grad_norm": 0.16410665214061737,
+      "learning_rate": 1.0377106790734392e-06,
+      "loss": 0.1407416582107544,
+      "step": 5010
+    },
+    {
+      "epoch": 0.9124818049490538,
+      "grad_norm": 0.18115971982479095,
+      "learning_rate": 1.016827376589674e-06,
+      "loss": 0.1427263855934143,
+      "step": 5015
+    },
+    {
+      "epoch": 0.913391557496361,
+      "grad_norm": 0.18507841229438782,
+      "learning_rate": 9.961519849904898e-07,
+      "loss": 0.1390499472618103,
+      "step": 5020
+    },
+    {
+      "epoch": 0.9143013100436681,
+      "grad_norm": 0.21296796202659607,
+      "learning_rate": 9.75684683511513e-07,
+      "loss": 0.1382216691970825,
+      "step": 5025
+    },
+    {
+      "epoch": 0.9152110625909753,
+      "grad_norm": 0.2308044582605362,
+      "learning_rate": 9.55425649584435e-07,
+      "loss": 0.14271280765533448,
+      "step": 5030
+    },
+    {
+      "epoch": 0.9161208151382824,
+      "grad_norm": 0.15796682238578796,
+      "learning_rate": 9.353750588354527e-07,
+      "loss": 0.13807624578475952,
+      "step": 5035
+    },
+    {
+      "epoch": 0.9170305676855895,
+      "grad_norm": 0.1695316582918167,
+      "learning_rate": 9.155330850837834e-07,
+      "loss": 0.14289476871490478,
+      "step": 5040
+    },
+    {
+      "epoch": 0.9179403202328966,
+      "grad_norm": 0.1738404780626297,
+      "learning_rate": 8.958999003401191e-07,
+      "loss": 0.14070619344711305,
+      "step": 5045
+    },
+    {
+      "epoch": 0.9188500727802038,
+      "grad_norm": 0.20618964731693268,
+      "learning_rate": 8.764756748051662e-07,
+      "loss": 0.14535053968429565,
+      "step": 5050
+    },
+    {
+      "epoch": 0.9197598253275109,
+      "grad_norm": 0.1506137251853943,
+      "learning_rate": 8.572605768681546e-07,
+      "loss": 0.13995139598846434,
+      "step": 5055
+    },
+    {
+      "epoch": 0.9206695778748181,
+      "grad_norm": 0.17772039771080017,
+      "learning_rate": 8.382547731053708e-07,
+      "loss": 0.14470311403274536,
+      "step": 5060
+    },
+    {
+      "epoch": 0.9215793304221251,
+      "grad_norm": 0.19897456467151642,
+      "learning_rate": 8.194584282787382e-07,
+      "loss": 0.144488525390625,
+      "step": 5065
+    },
+    {
+      "epoch": 0.9224890829694323,
+      "grad_norm": 0.15899236500263214,
+      "learning_rate": 8.008717053343606e-07,
+      "loss": 0.1352991580963135,
+      "step": 5070
+    },
+    {
+      "epoch": 0.9233988355167394,
+      "grad_norm": 0.14965768158435822,
+      "learning_rate": 7.824947654011345e-07,
+      "loss": 0.13827911615371705,
+      "step": 5075
+    },
+    {
+      "epoch": 0.9243085880640466,
+      "grad_norm": 0.43651485443115234,
+      "learning_rate": 7.643277677893329e-07,
+      "loss": 0.14149526357650757,
+      "step": 5080
+    },
+    {
+      "epoch": 0.9252183406113537,
+      "grad_norm": 0.19912713766098022,
+      "learning_rate": 7.463708699892325e-07,
+      "loss": 0.14357032775878906,
+      "step": 5085
+    },
+    {
+      "epoch": 0.9261280931586608,
+      "grad_norm": 0.1635904610157013,
+      "learning_rate": 7.286242276697524e-07,
+      "loss": 0.13550699949264527,
+      "step": 5090
+    },
+    {
+      "epoch": 0.9270378457059679,
+      "grad_norm": 0.19391080737113953,
+      "learning_rate": 7.11087994677101e-07,
+      "loss": 0.14674756526947022,
+      "step": 5095
+    },
+    {
+      "epoch": 0.9279475982532751,
+      "grad_norm": 0.17458125948905945,
+      "learning_rate": 6.937623230334284e-07,
+      "loss": 0.14155579805374147,
+      "step": 5100
+    },
+    {
+      "epoch": 0.9288573508005823,
+      "grad_norm": 0.1617971807718277,
+      "learning_rate": 6.766473629355452e-07,
+      "loss": 0.140555477142334,
+      "step": 5105
+    },
+    {
+      "epoch": 0.9297671033478894,
+      "grad_norm": 0.16945427656173706,
+      "learning_rate": 6.59743262753576e-07,
+      "loss": 0.13607511520385743,
+      "step": 5110
+    },
+    {
+      "epoch": 0.9306768558951966,
+      "grad_norm": 0.18347840011119843,
+      "learning_rate": 6.43050169029702e-07,
+      "loss": 0.14903461933135986,
+      "step": 5115
+    },
+    {
+      "epoch": 0.9315866084425036,
+      "grad_norm": 0.15434837341308594,
+      "learning_rate": 6.265682264768869e-07,
+      "loss": 0.14146015644073487,
+      "step": 5120
+    },
+    {
+      "epoch": 0.9324963609898108,
+      "grad_norm": 0.1397712528705597,
+      "learning_rate": 6.10297577977606e-07,
+      "loss": 0.14261592626571656,
+      "step": 5125
+    },
+    {
+      "epoch": 0.9334061135371179,
+      "grad_norm": 0.1765873283147812,
+      "learning_rate": 5.942383645826361e-07,
+      "loss": 0.13559447526931762,
+      "step": 5130
+    },
+    {
+      "epoch": 0.9343158660844251,
+      "grad_norm": 0.1656057983636856,
+      "learning_rate": 5.783907255098003e-07,
+      "loss": 0.13961490392684936,
+      "step": 5135
+    },
+    {
+      "epoch": 0.9352256186317321,
+      "grad_norm": 0.2169366180896759,
+      "learning_rate": 5.627547981427894e-07,
+      "loss": 0.1447835922241211,
+      "step": 5140
+    },
+    {
+      "epoch": 0.9361353711790393,
+      "grad_norm": 0.18623125553131104,
+      "learning_rate": 5.473307180299508e-07,
+      "loss": 0.14366730451583862,
+      "step": 5145
+    },
+    {
+      "epoch": 0.9370451237263464,
+      "grad_norm": 0.15423963963985443,
+      "learning_rate": 5.32118618883129e-07,
+      "loss": 0.14295632839202882,
+      "step": 5150
+    },
+    {
+      "epoch": 0.9379548762736536,
+      "grad_norm": 0.18423247337341309,
+      "learning_rate": 5.17118632576491e-07,
+      "loss": 0.14137414693832398,
+      "step": 5155
+    },
+    {
+      "epoch": 0.9388646288209607,
+      "grad_norm": 0.15338757634162903,
+      "learning_rate": 5.023308891453915e-07,
+      "loss": 0.13583066463470458,
+      "step": 5160
+    },
+    {
+      "epoch": 0.9397743813682679,
+      "grad_norm": 0.2293633222579956,
+      "learning_rate": 4.877555167852515e-07,
+      "loss": 0.14819620847702025,
+      "step": 5165
+    },
+    {
+      "epoch": 0.9406841339155749,
+      "grad_norm": 0.16889944672584534,
+      "learning_rate": 4.7339264185043974e-07,
+      "loss": 0.13617686033248902,
+      "step": 5170
+    },
+    {
+      "epoch": 0.9415938864628821,
+      "grad_norm": 0.1767464578151703,
+      "learning_rate": 4.5924238885316775e-07,
+      "loss": 0.13487552404403685,
+      "step": 5175
+    },
+    {
+      "epoch": 0.9425036390101892,
+      "grad_norm": 0.16697899997234344,
+      "learning_rate": 4.453048804624327e-07,
+      "loss": 0.1446886420249939,
+      "step": 5180
+    },
+    {
+      "epoch": 0.9434133915574964,
+      "grad_norm": 0.19576266407966614,
+      "learning_rate": 4.315802375029293e-07,
+      "loss": 0.14252450466156005,
+      "step": 5185
+    },
+    {
+      "epoch": 0.9443231441048034,
+      "grad_norm": 0.14838077127933502,
+      "learning_rate": 4.18068578954034e-07,
+      "loss": 0.13933032751083374,
+      "step": 5190
+    },
+    {
+      "epoch": 0.9452328966521106,
+      "grad_norm": 0.18481744825839996,
+      "learning_rate": 4.047700219487388e-07,
+      "loss": 0.1410665273666382,
+      "step": 5195
+    },
+    {
+      "epoch": 0.9461426491994177,
+      "grad_norm": 0.16954176127910614,
+      "learning_rate": 3.9168468177265547e-07,
+      "loss": 0.1421758770942688,
+      "step": 5200
+    },
+    {
+      "epoch": 0.9470524017467249,
+      "grad_norm": 0.17614421248435974,
+      "learning_rate": 3.7881267186301306e-07,
+      "loss": 0.14059911966323851,
+      "step": 5205
+    },
+    {
+      "epoch": 0.9479621542940321,
+      "grad_norm": 0.1637226939201355,
+      "learning_rate": 3.6615410380767544e-07,
+      "loss": 0.1360395908355713,
+      "step": 5210
+    },
+    {
+      "epoch": 0.9488719068413392,
+      "grad_norm": 0.18330250680446625,
+      "learning_rate": 3.5370908734417006e-07,
+      "loss": 0.14543824195861815,
+      "step": 5215
+    },
+    {
+      "epoch": 0.9497816593886463,
+      "grad_norm": 0.1895420402288437,
+      "learning_rate": 3.414777303587413e-07,
+      "loss": 0.15304578542709352,
+      "step": 5220
+    },
+    {
+      "epoch": 0.9506914119359534,
+      "grad_norm": 0.15384933352470398,
+      "learning_rate": 3.294601388854041e-07,
+      "loss": 0.14675912857055665,
+      "step": 5225
+    },
+    {
+      "epoch": 0.9516011644832606,
+      "grad_norm": 0.20188499987125397,
+      "learning_rate": 3.1765641710505e-07,
+      "loss": 0.14068362712860108,
+      "step": 5230
+    },
+    {
+      "epoch": 0.9525109170305677,
+      "grad_norm": 0.16467279195785522,
+      "learning_rate": 3.060666673445123e-07,
+      "loss": 0.14733167886734008,
+      "step": 5235
+    },
+    {
+      "epoch": 0.9534206695778749,
+      "grad_norm": 0.16632016003131866,
+      "learning_rate": 2.9469099007569943e-07,
+      "loss": 0.13753929138183593,
+      "step": 5240
+    },
+    {
+      "epoch": 0.9543304221251819,
+      "grad_norm": 0.1477566957473755,
+      "learning_rate": 2.83529483914724e-07,
+      "loss": 0.14354891777038575,
+      "step": 5245
+    },
+    {
+      "epoch": 0.9552401746724891,
+      "grad_norm": 0.1693645417690277,
+      "learning_rate": 2.7258224562102805e-07,
+      "loss": 0.14622807502746582,
+      "step": 5250
+    },
+    {
+      "epoch": 0.9561499272197962,
+      "grad_norm": 0.17574062943458557,
+      "learning_rate": 2.6184937009657295e-07,
+      "loss": 0.1344899296760559,
+      "step": 5255
+    },
+    {
+      "epoch": 0.9570596797671034,
+      "grad_norm": 0.17448563873767853,
+      "learning_rate": 2.513309503850009e-07,
+      "loss": 0.1355789542198181,
+      "step": 5260
+    },
+    {
+      "epoch": 0.9579694323144105,
+      "grad_norm": 0.16993778944015503,
+      "learning_rate": 2.41027077670819e-07,
+      "loss": 0.151595401763916,
+      "step": 5265
+    },
+    {
+      "epoch": 0.9588791848617176,
+      "grad_norm": 0.16944102942943573,
+      "learning_rate": 2.3093784127863062e-07,
+      "loss": 0.1466623306274414,
+      "step": 5270
+    },
+    {
+      "epoch": 0.9597889374090247,
+      "grad_norm": 0.18085163831710815,
+      "learning_rate": 2.2106332867234402e-07,
+      "loss": 0.14645814895629883,
+      "step": 5275
+    },
+    {
+      "epoch": 0.9606986899563319,
+      "grad_norm": 0.14682307839393616,
+      "learning_rate": 2.1140362545442605e-07,
+      "loss": 0.13901774883270263,
+      "step": 5280
+    },
+    {
+      "epoch": 0.961608442503639,
+      "grad_norm": 0.17189526557922363,
+      "learning_rate": 2.0195881536514694e-07,
+      "loss": 0.14153491258621215,
+      "step": 5285
+    },
+    {
+      "epoch": 0.9625181950509462,
+      "grad_norm": 0.1977207362651825,
+      "learning_rate": 1.9272898028186714e-07,
+      "loss": 0.1437437653541565,
+      "step": 5290
+    },
+    {
+      "epoch": 0.9634279475982532,
+      "grad_norm": 0.16637668013572693,
+      "learning_rate": 1.837142002183184e-07,
+      "loss": 0.13910138607025146,
+      "step": 5295
+    },
+    {
+      "epoch": 0.9643377001455604,
+      "grad_norm": 0.18155774474143982,
+      "learning_rate": 1.7491455332391548e-07,
+      "loss": 0.14177814722061158,
+      "step": 5300
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.91270363724079e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-5300/training_args.bin b/checkpoint-5300/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-5300/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-5400/README.md b/checkpoint-5400/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-5400/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section covers use of the model without fine-tuning or plugging it into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section covers use of the model when it is fine-tuned for a task or plugged into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly. -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-5400/adapter_config.json b/checkpoint-5400/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-5400/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-5400/adapter_model.safetensors b/checkpoint-5400/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ed48240c1e1bedc059d842cc673ed7231ebea701
--- /dev/null
+++ b/checkpoint-5400/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39a155a62a6b82ab171b303deed63e164255b839c4d35d3f4dc73a391c560321
+size 169741912
diff --git a/checkpoint-5400/chat_template.jinja b/checkpoint-5400/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-5400/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders every entry of `properties`, sorted by key, as a comma-separated `key:{ fields }` list. The `required` argument is accepted but never read inside this body. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}{#- Keys skipped when filter_keys is true. -#}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- Turns true once a field has been written inside the current entry, so later fields are preceded by a comma. -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- OBJECT without a `properties` mapping: the schema's own keys, minus standard_keys, are rendered as its properties. -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- `type` is always written last, upper-cased, and closes the entry. -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool as `declaration:NAME` followed by a braced body holding description, optional parameters and optional response. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>
+        {%- endif -%}{{- '}' -}}{#- Always close the parameters brace; it used to be written only when a type was present, which left the declaration unbalanced. NOTE(review): without a type the preceding trailing comma is still emitted. -#}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>
+        {%- endif -%}{{- '}' -}}{#- Always close the response brace, also when the type is missing or not OBJECT. -#}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {#- Serialises one value in tool-call notation; mappings and sequences recurse into this macro. -#}
+    {%- if argument is string -%}
+        <|"|>{{- argument -}}<|"|>
+    {%- elif argument is boolean -%}
+        {%- if argument -%}true{%- else -%}false{%- endif -%}
+    {%- elif argument is mapping -%}
+        {
+        {%- for entry_key, entry_value in argument | dictsort -%}
+            {#- A comma goes in front of every entry except the first. -#}
+            {%- if not loop.first %},{% endif -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + entry_key + '<|"|>' -}}
+            {%- else -%}
+                {{- entry_key -}}
+            {%- endif -%}
+            :{{- format_argument(entry_value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        }
+    {%- elif argument is sequence -%}
+        [
+        {%- for element in argument -%}
+            {%- if not loop.first %},{% endif -%}
+            {{- format_argument(element, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        ]
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}
+    {#- Drops the text from each '<|channel>' up to the next '<channel|>' (or the end of the input) and trims the result. -#}
+    {%- set kept = namespace(value='') -%}
+    {%- for segment in text.split('<channel|>') -%}
+        {#- Only what precedes an opening marker survives; without a marker -#}
+        {#- split() returns the whole segment as element 0, so no branch is needed. -#}
+        {%- set visible = segment.split('<|channel>')[0] -%}
+        {%- set kept.value = kept.value + visible -%}
+    {%- endfor -%}
+    {{- kept.value | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {#- Writes one tool result, wrapped in the tool_response start and end markers, as response:NAME plus a braced body. -#}
+    {{- '<|tool_response>' + 'response:' + tool_name + '{' -}}
+    {%- if response is not mapping -%}
+        {#- Anything that is not a mapping is placed under a single 'value' key. -#}
+        {{- 'value:' + format_argument(response, escape_keys=False) -}}
+    {%- else -%}
+        {%- for field, field_value in response | dictsort -%}
+            {%- if not loop.first %},{% endif -%}
+            {{- field -}}:{{- format_argument(field_value, escape_keys=False) -}}
+        {%- endfor -%}
+    {%- endif -%}
+    {{- '}' + '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type: kind of the last item written; read when a turn is closed and when the generation prompt is decided. -#}
+{%- set loop_messages = messages -%}{#- Messages handed to the main loop; re-bound below when messages[0] is rendered as the system turn. -#}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block: opened when thinking is enabled, tools are given, or the first message is system/developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}{#- Each content part contributes its trimmed `text` followed by one space. -#}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- The first message has been consumed here, so the main loop starts at the second one. -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: remember the position of the final user message; the reasoning guard in the message loop compares against it -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for candidate in loop_messages -%}
+    {%- if candidate['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = loop.index0 -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages. role:tool messages are not rendered as turns of their own; they are folded into the assistant turn that issued the call by the forward scan below -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel, only for tool-calling messages that come after the last user message -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}{#- flag: at least one tool response was written into this turn -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native); a missing or None name falls back to 'unknown' -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown', true), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name. get() yields None, not Undefined, so default() needs its boolean flag to fall back to 'unknown' -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array; only parts of type 'text' are concatenated -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+        {#- Close the turn: an unanswered tool call ends with the tool_response start marker instead of the end-of-turn marker; a turn holding tool responses but no text is left open -#}
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- No model header is written when the last item rendered was a tool call or a tool response. -#}
+    {%- if ns.prev_message_type not in ['tool_response', 'tool_call'] -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-5400/optimizer.pt b/checkpoint-5400/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d4d02c065e044192e5fc5d300659ce1b63ec0a01
--- /dev/null
+++ b/checkpoint-5400/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3ce57aac76e6b3a00f67ce909a57612ce91cf2745dbc74a149dbdb1b267e843
+size 72807355
diff --git a/checkpoint-5400/processor_config.json b/checkpoint-5400/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-5400/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-5400/rng_state.pth b/checkpoint-5400/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-5400/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-5400/scheduler.pt b/checkpoint-5400/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..66d1eba99819f04ced6a1cd66ed865261031dfc2
--- /dev/null
+++ b/checkpoint-5400/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2383841240df643f2cc3bec80f20fb5b80bd822a02870b923d76fb12aba5b41
+size 1465
diff --git a/checkpoint-5400/tokenizer.json b/checkpoint-5400/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-5400/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-5400/tokenizer_config.json b/checkpoint-5400/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-5400/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-5400/trainer_state.json b/checkpoint-5400/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..7c03c77701b49c9fe8ee1dbcd6359f15bab762bd
--- /dev/null
+++ b/checkpoint-5400/trainer_state.json
@@ -0,0 +1,7602 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.982532751091703,
+  "eval_steps": 100,
+  "global_step": 5400,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6377365356622998,
+      "grad_norm": 0.17014239728450775,
+      "learning_rate": 1.5370045130657366e-05,
+      "loss": 0.14694437980651856,
+      "step": 3505
+    },
+    {
+      "epoch": 0.638646288209607,
+      "grad_norm": 0.14744038879871368,
+      "learning_rate": 1.5302158945041838e-05,
+      "loss": 0.14434736967086792,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6395560407569141,
+      "grad_norm": 0.2069770246744156,
+      "learning_rate": 1.523435683031818e-05,
+      "loss": 0.13982917070388795,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6404657933042213,
+      "grad_norm": 0.17811502516269684,
+      "learning_rate": 1.5166639374265063e-05,
+      "loss": 0.1408839702606201,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6413755458515283,
+      "grad_norm": 0.165786474943161,
+      "learning_rate": 1.509900716392728e-05,
+      "loss": 0.15312877893447877,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6422852983988355,
+      "grad_norm": 0.1633884161710739,
+      "learning_rate": 1.5031460785610596e-05,
+      "loss": 0.1488795518875122,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6431950509461426,
+      "grad_norm": 0.16498984396457672,
+      "learning_rate": 1.4964000824876723e-05,
+      "loss": 0.15031465291976928,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6441048034934498,
+      "grad_norm": 0.18043678998947144,
+      "learning_rate": 1.4896627866538191e-05,
+      "loss": 0.147829806804657,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6450145560407569,
+      "grad_norm": 0.16813597083091736,
+      "learning_rate": 1.4829342494653315e-05,
+      "loss": 0.1418998956680298,
+      "step": 3545
+    },
+    {
+      "epoch": 0.645924308588064,
+      "grad_norm": 0.1817242056131363,
+      "learning_rate": 1.4762145292521118e-05,
+      "loss": 0.14508869647979736,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6468340611353712,
+      "grad_norm": 0.14666494727134705,
+      "learning_rate": 1.469503684267628e-05,
+      "loss": 0.14159854650497436,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6477438136826783,
+      "grad_norm": 0.16485381126403809,
+      "learning_rate": 1.4628017726884086e-05,
+      "loss": 0.14419105052947997,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6486535662299855,
+      "grad_norm": 0.16100342571735382,
+      "learning_rate": 1.4561088526135375e-05,
+      "loss": 0.14501721858978273,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6495633187772926,
+      "grad_norm": 0.16996590793132782,
+      "learning_rate": 1.4494249820641493e-05,
+      "loss": 0.1377166509628296,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6504730713245997,
+      "grad_norm": 0.16168837249279022,
+      "learning_rate": 1.4427502189829339e-05,
+      "loss": 0.1414325475692749,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6513828238719068,
+      "grad_norm": 0.16318906843662262,
+      "learning_rate": 1.436084621233621e-05,
+      "loss": 0.14685193300247193,
+      "step": 3580
+    },
+    {
+      "epoch": 0.652292576419214,
+      "grad_norm": 0.1636219322681427,
+      "learning_rate": 1.4294282466004899e-05,
+      "loss": 0.1405899167060852,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6532023289665211,
+      "grad_norm": 0.1838461309671402,
+      "learning_rate": 1.422781152787865e-05,
+      "loss": 0.14386332035064697,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6541120815138283,
+      "grad_norm": 0.1796344667673111,
+      "learning_rate": 1.4161433974196115e-05,
+      "loss": 0.1513024687767029,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6550218340611353,
+      "grad_norm": 0.16424529254436493,
+      "learning_rate": 1.4095150380386427e-05,
+      "loss": 0.14238927364349366,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6559315866084425,
+      "grad_norm": 0.19264160096645355,
+      "learning_rate": 1.402896132106415e-05,
+      "loss": 0.14297477006912232,
+      "step": 3605
+    },
+    {
+      "epoch": 0.6568413391557496,
+      "grad_norm": 0.18319948017597198,
+      "learning_rate": 1.3962867370024347e-05,
+      "loss": 0.1448880434036255,
+      "step": 3610
+    },
+    {
+      "epoch": 0.6577510917030568,
+      "grad_norm": 0.16507290303707123,
+      "learning_rate": 1.389686910023758e-05,
+      "loss": 0.14724698066711425,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6586608442503639,
+      "grad_norm": 0.17871244251728058,
+      "learning_rate": 1.3830967083844942e-05,
+      "loss": 0.14479386806488037,
+      "step": 3620
+    },
+    {
+      "epoch": 0.659570596797671,
+      "grad_norm": 0.1846228390932083,
+      "learning_rate": 1.3765161892153112e-05,
+      "loss": 0.1453616738319397,
+      "step": 3625
+    },
+    {
+      "epoch": 0.6604803493449781,
+      "grad_norm": 0.17185978591442108,
+      "learning_rate": 1.3699454095629372e-05,
+      "loss": 0.14906206130981445,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6613901018922853,
+      "grad_norm": 0.14751191437244415,
+      "learning_rate": 1.3633844263896698e-05,
+      "loss": 0.13991892337799072,
+      "step": 3635
+    },
+    {
+      "epoch": 0.6622998544395924,
+      "grad_norm": 0.22059763967990875,
+      "learning_rate": 1.3568332965728817e-05,
+      "loss": 0.14680869579315187,
+      "step": 3640
+    },
+    {
+      "epoch": 0.6632096069868996,
+      "grad_norm": 0.15295909345149994,
+      "learning_rate": 1.3502920769045232e-05,
+      "loss": 0.1404443383216858,
+      "step": 3645
+    },
+    {
+      "epoch": 0.6641193595342066,
+      "grad_norm": 0.14600558578968048,
+      "learning_rate": 1.3437608240906364e-05,
+      "loss": 0.14663270711898804,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6650291120815138,
+      "grad_norm": 0.15548352897167206,
+      "learning_rate": 1.3372395947508587e-05,
+      "loss": 0.1431443452835083,
+      "step": 3655
+    },
+    {
+      "epoch": 0.665938864628821,
+      "grad_norm": 0.1813388466835022,
+      "learning_rate": 1.3307284454179342e-05,
+      "loss": 0.1458706736564636,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6668486171761281,
+      "grad_norm": 0.16326870024204254,
+      "learning_rate": 1.3242274325372247e-05,
+      "loss": 0.14700595140457154,
+      "step": 3665
+    },
+    {
+      "epoch": 0.6677583697234353,
+      "grad_norm": 0.18779197335243225,
+      "learning_rate": 1.3177366124662149e-05,
+      "loss": 0.1497237801551819,
+      "step": 3670
+    },
+    {
+      "epoch": 0.6686681222707423,
+      "grad_norm": 0.16291002929210663,
+      "learning_rate": 1.3112560414740315e-05,
+      "loss": 0.1387086868286133,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6695778748180495,
+      "grad_norm": 0.1532297134399414,
+      "learning_rate": 1.3047857757409487e-05,
+      "loss": 0.14497545957565308,
+      "step": 3680
+    },
+    {
+      "epoch": 0.6704876273653566,
+      "grad_norm": 0.14697515964508057,
+      "learning_rate": 1.2983258713579066e-05,
+      "loss": 0.1494283437728882,
+      "step": 3685
+    },
+    {
+      "epoch": 0.6713973799126638,
+      "grad_norm": 0.15213452279567719,
+      "learning_rate": 1.2918763843260218e-05,
+      "loss": 0.1468907594680786,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6723071324599709,
+      "grad_norm": 0.1745215803384781,
+      "learning_rate": 1.285437370556099e-05,
+      "loss": 0.14997754096984864,
+      "step": 3695
+    },
+    {
+      "epoch": 0.673216885007278,
+      "grad_norm": 0.19207637012004852,
+      "learning_rate": 1.2790088858681577e-05,
+      "loss": 0.14202862977981567,
+      "step": 3700
+    },
+    {
+      "epoch": 0.6741266375545851,
+      "grad_norm": 0.1521359086036682,
+      "learning_rate": 1.2725909859909313e-05,
+      "loss": 0.14547673463821412,
+      "step": 3705
+    },
+    {
+      "epoch": 0.6750363901018923,
+      "grad_norm": 0.16975535452365875,
+      "learning_rate": 1.2661837265613999e-05,
+      "loss": 0.14006874561309815,
+      "step": 3710
+    },
+    {
+      "epoch": 0.6759461426491994,
+      "grad_norm": 0.22234582901000977,
+      "learning_rate": 1.2597871631242992e-05,
+      "loss": 0.13691173791885375,
+      "step": 3715
+    },
+    {
+      "epoch": 0.6768558951965066,
+      "grad_norm": 0.16082969307899475,
+      "learning_rate": 1.2534013511316383e-05,
+      "loss": 0.14932308197021485,
+      "step": 3720
+    },
+    {
+      "epoch": 0.6777656477438136,
+      "grad_norm": 0.1751091182231903,
+      "learning_rate": 1.247026345942226e-05,
+      "loss": 0.14531974792480468,
+      "step": 3725
+    },
+    {
+      "epoch": 0.6786754002911208,
+      "grad_norm": 0.15838147699832916,
+      "learning_rate": 1.2406622028211844e-05,
+      "loss": 0.14759832620620728,
+      "step": 3730
+    },
+    {
+      "epoch": 0.6795851528384279,
+      "grad_norm": 0.1771744042634964,
+      "learning_rate": 1.2343089769394714e-05,
+      "loss": 0.1382831573486328,
+      "step": 3735
+    },
+    {
+      "epoch": 0.6804949053857351,
+      "grad_norm": 0.16301538050174713,
+      "learning_rate": 1.2279667233734037e-05,
+      "loss": 0.14444775581359864,
+      "step": 3740
+    },
+    {
+      "epoch": 0.6814046579330422,
+      "grad_norm": 0.1584121286869049,
+      "learning_rate": 1.2216354971041796e-05,
+      "loss": 0.14200170040130616,
+      "step": 3745
+    },
+    {
+      "epoch": 0.6823144104803494,
+      "grad_norm": 0.139187291264534,
+      "learning_rate": 1.2153153530174007e-05,
+      "loss": 0.14318310022354125,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6832241630276564,
+      "grad_norm": 0.13665248453617096,
+      "learning_rate": 1.2090063459025955e-05,
+      "loss": 0.1411946654319763,
+      "step": 3755
+    },
+    {
+      "epoch": 0.6841339155749636,
+      "grad_norm": 0.16273781657218933,
+      "learning_rate": 1.2027085304527475e-05,
+      "loss": 0.14873508214950562,
+      "step": 3760
+    },
+    {
+      "epoch": 0.6850436681222707,
+      "grad_norm": 0.16317526996135712,
+      "learning_rate": 1.1964219612638194e-05,
+      "loss": 0.14644203186035157,
+      "step": 3765
+    },
+    {
+      "epoch": 0.6859534206695779,
+      "grad_norm": 0.17253617942333221,
+      "learning_rate": 1.1901466928342777e-05,
+      "loss": 0.14027841091156007,
+      "step": 3770
+    },
+    {
+      "epoch": 0.6868631732168851,
+      "grad_norm": 0.19692830741405487,
+      "learning_rate": 1.183882779564624e-05,
+      "loss": 0.14411110877990724,
+      "step": 3775
+    },
+    {
+      "epoch": 0.6877729257641921,
+      "grad_norm": 0.15444578230381012,
+      "learning_rate": 1.1776302757569214e-05,
+      "loss": 0.14355008602142333,
+      "step": 3780
+    },
+    {
+      "epoch": 0.6886826783114993,
+      "grad_norm": 0.1622200757265091,
+      "learning_rate": 1.1713892356143239e-05,
+      "loss": 0.14794334173202514,
+      "step": 3785
+    },
+    {
+      "epoch": 0.6895924308588064,
+      "grad_norm": 0.1898501068353653,
+      "learning_rate": 1.1651597132406073e-05,
+      "loss": 0.1418622612953186,
+      "step": 3790
+    },
+    {
+      "epoch": 0.6905021834061136,
+      "grad_norm": 0.17803208529949188,
+      "learning_rate": 1.1589417626396973e-05,
+      "loss": 0.14576040506362914,
+      "step": 3795
+    },
+    {
+      "epoch": 0.6914119359534207,
+      "grad_norm": 0.17138013243675232,
+      "learning_rate": 1.1527354377152053e-05,
+      "loss": 0.14494270086288452,
+      "step": 3800
+    },
+    {
+      "epoch": 0.6923216885007278,
+      "grad_norm": 0.15170913934707642,
+      "learning_rate": 1.1465407922699603e-05,
+      "loss": 0.144084370136261,
+      "step": 3805
+    },
+    {
+      "epoch": 0.6932314410480349,
+      "grad_norm": 0.158562570810318,
+      "learning_rate": 1.1403578800055387e-05,
+      "loss": 0.13636608123779298,
+      "step": 3810
+    },
+    {
+      "epoch": 0.6941411935953421,
+      "grad_norm": 0.17687302827835083,
+      "learning_rate": 1.1341867545218044e-05,
+      "loss": 0.14214688539505005,
+      "step": 3815
+    },
+    {
+      "epoch": 0.6950509461426492,
+      "grad_norm": 0.15394899249076843,
+      "learning_rate": 1.1280274693164378e-05,
+      "loss": 0.14914129972457885,
+      "step": 3820
+    },
+    {
+      "epoch": 0.6959606986899564,
+      "grad_norm": 0.15709355473518372,
+      "learning_rate": 1.12188007778448e-05,
+      "loss": 0.14798580408096312,
+      "step": 3825
+    },
+    {
+      "epoch": 0.6968704512372634,
+      "grad_norm": 0.16631539165973663,
+      "learning_rate": 1.115744633217864e-05,
+      "loss": 0.14756966829299928,
+      "step": 3830
+    },
+    {
+      "epoch": 0.6977802037845706,
+      "grad_norm": 0.15893076360225677,
+      "learning_rate": 1.109621188804951e-05,
+      "loss": 0.14061959981918334,
+      "step": 3835
+    },
+    {
+      "epoch": 0.6986899563318777,
+      "grad_norm": 0.183414489030838,
+      "learning_rate": 1.103509797630077e-05,
+      "loss": 0.1448473334312439,
+      "step": 3840
+    },
+    {
+      "epoch": 0.6995997088791849,
+      "grad_norm": 0.14087305963039398,
+      "learning_rate": 1.0974105126730841e-05,
+      "loss": 0.14369285106658936,
+      "step": 3845
+    },
+    {
+      "epoch": 0.700509461426492,
+      "grad_norm": 0.16919967532157898,
+      "learning_rate": 1.0913233868088685e-05,
+      "loss": 0.1478085398674011,
+      "step": 3850
+    },
+    {
+      "epoch": 0.7014192139737991,
+      "grad_norm": 0.1439533829689026,
+      "learning_rate": 1.0852484728069178e-05,
+      "loss": 0.14376721382141114,
+      "step": 3855
+    },
+    {
+      "epoch": 0.7023289665211062,
+      "grad_norm": 0.17719274759292603,
+      "learning_rate": 1.0791858233308521e-05,
+      "loss": 0.14089040756225585,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7032387190684134,
+      "grad_norm": 0.19753769040107727,
+      "learning_rate": 1.0731354909379754e-05,
+      "loss": 0.15021742582321168,
+      "step": 3865
+    },
+    {
+      "epoch": 0.7041484716157205,
+      "grad_norm": 0.19186992943286896,
+      "learning_rate": 1.0670975280788086e-05,
+      "loss": 0.14113202095031738,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7050582241630277,
+      "grad_norm": 0.1709229201078415,
+      "learning_rate": 1.0610719870966443e-05,
+      "loss": 0.1500566840171814,
+      "step": 3875
+    },
+    {
+      "epoch": 0.7059679767103348,
+      "grad_norm": 0.17846204340457916,
+      "learning_rate": 1.0550589202270892e-05,
+      "loss": 0.15014195442199707,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7068777292576419,
+      "grad_norm": 0.1827082335948944,
+      "learning_rate": 1.0490583795976091e-05,
+      "loss": 0.1423472762107849,
+      "step": 3885
+    },
+    {
+      "epoch": 0.7077874818049491,
+      "grad_norm": 0.17418377101421356,
+      "learning_rate": 1.043070417227083e-05,
+      "loss": 0.14668900966644288,
+      "step": 3890
+    },
+    {
+      "epoch": 0.7086972343522562,
+      "grad_norm": 0.17385616898536682,
+      "learning_rate": 1.0370950850253449e-05,
+      "loss": 0.14627279043197633,
+      "step": 3895
+    },
+    {
+      "epoch": 0.7096069868995634,
+      "grad_norm": 0.16486723721027374,
+      "learning_rate": 1.0311324347927404e-05,
+      "loss": 0.14603652954101562,
+      "step": 3900
+    },
+    {
+      "epoch": 0.7105167394468704,
+      "grad_norm": 0.21806862950325012,
+      "learning_rate": 1.0251825182196732e-05,
+      "loss": 0.1488169550895691,
+      "step": 3905
+    },
+    {
+      "epoch": 0.7114264919941776,
+      "grad_norm": 0.19884569942951202,
+      "learning_rate": 1.019245386886159e-05,
+      "loss": 0.14387656450271608,
+      "step": 3910
+    },
+    {
+      "epoch": 0.7123362445414847,
+      "grad_norm": 0.16139011085033417,
+      "learning_rate": 1.0133210922613789e-05,
+      "loss": 0.1483074426651001,
+      "step": 3915
+    },
+    {
+      "epoch": 0.7132459970887919,
+      "grad_norm": 0.17000740766525269,
+      "learning_rate": 1.007409685703229e-05,
+      "loss": 0.14050065279006957,
+      "step": 3920
+    },
+    {
+      "epoch": 0.714155749636099,
+      "grad_norm": 0.17235304415225983,
+      "learning_rate": 1.0015112184578813e-05,
+      "loss": 0.1440442681312561,
+      "step": 3925
+    },
+    {
+      "epoch": 0.7150655021834061,
+      "grad_norm": 0.15737567842006683,
+      "learning_rate": 9.956257416593362e-06,
+      "loss": 0.14960765838623047,
+      "step": 3930
+    },
+    {
+      "epoch": 0.7159752547307132,
+      "grad_norm": 0.15499180555343628,
+      "learning_rate": 9.897533063289773e-06,
+      "loss": 0.14488829374313356,
+      "step": 3935
+    },
+    {
+      "epoch": 0.7168850072780204,
+      "grad_norm": 0.17744216322898865,
+      "learning_rate": 9.838939633751337e-06,
+      "loss": 0.1416949987411499,
+      "step": 3940
+    },
+    {
+      "epoch": 0.7177947598253275,
+      "grad_norm": 0.1597192883491516,
+      "learning_rate": 9.780477635926358e-06,
+      "loss": 0.14275280237197877,
+      "step": 3945
+    },
+    {
+      "epoch": 0.7187045123726347,
+      "grad_norm": 0.17800374329090118,
+      "learning_rate": 9.722147576623743e-06,
+      "loss": 0.14532098770141602,
+      "step": 3950
+    },
+    {
+      "epoch": 0.7196142649199417,
+      "grad_norm": 0.1828162521123886,
+      "learning_rate": 9.66394996150864e-06,
+      "loss": 0.14525585174560546,
+      "step": 3955
+    },
+    {
+      "epoch": 0.7205240174672489,
+      "grad_norm": 0.1800539344549179,
+      "learning_rate": 9.605885295098005e-06,
+      "loss": 0.14235819578170777,
+      "step": 3960
+    },
+    {
+      "epoch": 0.721433770014556,
+      "grad_norm": 0.16556483507156372,
+      "learning_rate": 9.54795408075628e-06,
+      "loss": 0.13965482711791993,
+      "step": 3965
+    },
+    {
+      "epoch": 0.7223435225618632,
+      "grad_norm": 0.1592024862766266,
+      "learning_rate": 9.49015682069101e-06,
+      "loss": 0.14051042795181273,
+      "step": 3970
+    },
+    {
+      "epoch": 0.7232532751091703,
+      "grad_norm": 0.18988847732543945,
+      "learning_rate": 9.43249401594846e-06,
+      "loss": 0.1436900496482849,
+      "step": 3975
+    },
+    {
+      "epoch": 0.7241630276564774,
+      "grad_norm": 0.24433808028697968,
+      "learning_rate": 9.374966166409329e-06,
+      "loss": 0.14883997440338134,
+      "step": 3980
+    },
+    {
+      "epoch": 0.7250727802037845,
+      "grad_norm": 0.15091639757156372,
+      "learning_rate": 9.317573770784352e-06,
+      "loss": 0.14726560115814208,
+      "step": 3985
+    },
+    {
+      "epoch": 0.7259825327510917,
+      "grad_norm": 0.17045573890209198,
+      "learning_rate": 9.260317326610051e-06,
+      "loss": 0.14120506048202514,
+      "step": 3990
+    },
+    {
+      "epoch": 0.7268922852983989,
+      "grad_norm": 0.18847957253456116,
+      "learning_rate": 9.203197330244343e-06,
+      "loss": 0.1377041220664978,
+      "step": 3995
+    },
+    {
+      "epoch": 0.727802037845706,
+      "grad_norm": 0.1516445279121399,
+      "learning_rate": 9.14621427686229e-06,
+      "loss": 0.14043946266174318,
+      "step": 4000
+    },
+    {
+      "epoch": 0.7287117903930131,
+      "grad_norm": 0.18264050781726837,
+      "learning_rate": 9.0893686604518e-06,
+      "loss": 0.14080368280410765,
+      "step": 4005
+    },
+    {
+      "epoch": 0.7296215429403202,
+      "grad_norm": 0.19129371643066406,
+      "learning_rate": 9.032660973809312e-06,
+      "loss": 0.1402561902999878,
+      "step": 4010
+    },
+    {
+      "epoch": 0.7305312954876274,
+      "grad_norm": 0.15762710571289062,
+      "learning_rate": 8.976091708535567e-06,
+      "loss": 0.14421157836914061,
+      "step": 4015
+    },
+    {
+      "epoch": 0.7314410480349345,
+      "grad_norm": 0.17785198986530304,
+      "learning_rate": 8.919661355031331e-06,
+      "loss": 0.14999009370803834,
+      "step": 4020
+    },
+    {
+      "epoch": 0.7323508005822417,
+      "grad_norm": 0.15306031703948975,
+      "learning_rate": 8.8633704024931e-06,
+      "loss": 0.14101698398590087,
+      "step": 4025
+    },
+    {
+      "epoch": 0.7332605531295487,
+      "grad_norm": 0.16481758654117584,
+      "learning_rate": 8.807219338908968e-06,
+      "loss": 0.14170764684677123,
+      "step": 4030
+    },
+    {
+      "epoch": 0.7341703056768559,
+      "grad_norm": 0.14892235398292542,
+      "learning_rate": 8.751208651054257e-06,
+      "loss": 0.15317896604537964,
+      "step": 4035
+    },
+    {
+      "epoch": 0.735080058224163,
+      "grad_norm": 0.1775592565536499,
+      "learning_rate": 8.695338824487409e-06,
+      "loss": 0.1520617723464966,
+      "step": 4040
+    },
+    {
+      "epoch": 0.7359898107714702,
+      "grad_norm": 0.1614258885383606,
+      "learning_rate": 8.639610343545728e-06,
+      "loss": 0.13747400045394897,
+      "step": 4045
+    },
+    {
+      "epoch": 0.7368995633187773,
+      "grad_norm": 0.21415506303310394,
+      "learning_rate": 8.58402369134117e-06,
+      "loss": 0.1432439088821411,
+      "step": 4050
+    },
+    {
+      "epoch": 0.7378093158660844,
+      "grad_norm": 0.1759418249130249,
+      "learning_rate": 8.528579349756205e-06,
+      "loss": 0.141641104221344,
+      "step": 4055
+    },
+    {
+      "epoch": 0.7387190684133915,
+      "grad_norm": 0.16738329827785492,
+      "learning_rate": 8.47327779943957e-06,
+      "loss": 0.14294810295104982,
+      "step": 4060
+    },
+    {
+      "epoch": 0.7396288209606987,
+      "grad_norm": 0.13916844129562378,
+      "learning_rate": 8.41811951980217e-06,
+      "loss": 0.13876968622207642,
+      "step": 4065
+    },
+    {
+      "epoch": 0.7405385735080058,
+      "grad_norm": 0.1828441321849823,
+      "learning_rate": 8.36310498901288e-06,
+      "loss": 0.148428475856781,
+      "step": 4070
+    },
+    {
+      "epoch": 0.741448326055313,
+      "grad_norm": 0.16534076631069183,
+      "learning_rate": 8.308234683994415e-06,
+      "loss": 0.14222711324691772,
+      "step": 4075
+    },
+    {
+      "epoch": 0.74235807860262,
+      "grad_norm": 0.17922644317150116,
+      "learning_rate": 8.253509080419198e-06,
+      "loss": 0.14365782737731933,
+      "step": 4080
+    },
+    {
+      "epoch": 0.7432678311499272,
+      "grad_norm": 0.15061035752296448,
+      "learning_rate": 8.198928652705204e-06,
+      "loss": 0.13571925163269044,
+      "step": 4085
+    },
+    {
+      "epoch": 0.7441775836972343,
+      "grad_norm": 0.18075402081012726,
+      "learning_rate": 8.144493874011908e-06,
+      "loss": 0.14385528564453126,
+      "step": 4090
+    },
+    {
+      "epoch": 0.7450873362445415,
+      "grad_norm": 0.16514739394187927,
+      "learning_rate": 8.090205216236135e-06,
+      "loss": 0.14920626878738402,
+      "step": 4095
+    },
+    {
+      "epoch": 0.7459970887918487,
+      "grad_norm": 0.16453702747821808,
+      "learning_rate": 8.03606315000797e-06,
+      "loss": 0.14704222679138185,
+      "step": 4100
+    },
+    {
+      "epoch": 0.7469068413391557,
+      "grad_norm": 0.16719917953014374,
+      "learning_rate": 7.982068144686707e-06,
+      "loss": 0.14722511768341065,
+      "step": 4105
+    },
+    {
+      "epoch": 0.7478165938864629,
+      "grad_norm": 0.18499110639095306,
+      "learning_rate": 7.92822066835677e-06,
+      "loss": 0.1401848554611206,
+      "step": 4110
+    },
+    {
+      "epoch": 0.74872634643377,
+      "grad_norm": 0.17249563336372375,
+      "learning_rate": 7.87452118782363e-06,
+      "loss": 0.15132423639297485,
+      "step": 4115
+    },
+    {
+      "epoch": 0.7496360989810772,
+      "grad_norm": 0.15049682557582855,
+      "learning_rate": 7.8209701686098e-06,
+      "loss": 0.1341150164604187,
+      "step": 4120
+    },
+    {
+      "epoch": 0.7505458515283843,
+      "grad_norm": 0.16892646253108978,
+      "learning_rate": 7.767568074950751e-06,
+      "loss": 0.1466840147972107,
+      "step": 4125
+    },
+    {
+      "epoch": 0.7514556040756915,
+      "grad_norm": 0.17288286983966827,
+      "learning_rate": 7.714315369790942e-06,
+      "loss": 0.13819680213928223,
+      "step": 4130
+    },
+    {
+      "epoch": 0.7523653566229985,
+      "grad_norm": 0.21893996000289917,
+      "learning_rate": 7.661212514779745e-06,
+      "loss": 0.14369510412216185,
+      "step": 4135
+    },
+    {
+      "epoch": 0.7532751091703057,
+      "grad_norm": 0.1674601435661316,
+      "learning_rate": 7.608259970267509e-06,
+      "loss": 0.14810250997543334,
+      "step": 4140
+    },
+    {
+      "epoch": 0.7541848617176128,
+      "grad_norm": 0.15875539183616638,
+      "learning_rate": 7.555458195301526e-06,
+      "loss": 0.14103198051452637,
+      "step": 4145
+    },
+    {
+      "epoch": 0.75509461426492,
+      "grad_norm": 0.19454079866409302,
+      "learning_rate": 7.502807647622037e-06,
+      "loss": 0.13848764896392823,
+      "step": 4150
+    },
+    {
+      "epoch": 0.756004366812227,
+      "grad_norm": 0.1795455813407898,
+      "learning_rate": 7.450308783658341e-06,
+      "loss": 0.14459335803985596,
+      "step": 4155
+    },
+    {
+      "epoch": 0.7569141193595342,
+      "grad_norm": 0.1643362045288086,
+      "learning_rate": 7.397962058524735e-06,
+      "loss": 0.14335378408432006,
+      "step": 4160
+    },
+    {
+      "epoch": 0.7578238719068413,
+      "grad_norm": 0.16362066566944122,
+      "learning_rate": 7.3457679260166475e-06,
+      "loss": 0.14222005605697632,
+      "step": 4165
+    },
+    {
+      "epoch": 0.7587336244541485,
+      "grad_norm": 0.17313003540039062,
+      "learning_rate": 7.293726838606674e-06,
+      "loss": 0.14272255897521974,
+      "step": 4170
+    },
+    {
+      "epoch": 0.7596433770014556,
+      "grad_norm": 0.1809929460287094,
+      "learning_rate": 7.2418392474406405e-06,
+      "loss": 0.14089123010635377,
+      "step": 4175
+    },
+    {
+      "epoch": 0.7605531295487628,
+      "grad_norm": 0.14306005835533142,
+      "learning_rate": 7.19010560233373e-06,
+      "loss": 0.13531534671783446,
+      "step": 4180
+    },
+    {
+      "epoch": 0.7614628820960698,
+      "grad_norm": 0.15525390207767487,
+      "learning_rate": 7.138526351766559e-06,
+      "loss": 0.14340845346450806,
+      "step": 4185
+    },
+    {
+      "epoch": 0.762372634643377,
+      "grad_norm": 0.24478943645954132,
+      "learning_rate": 7.087101942881263e-06,
+      "loss": 0.14744555950164795,
+      "step": 4190
+    },
+    {
+      "epoch": 0.7632823871906841,
+      "grad_norm": 0.31335577368736267,
+      "learning_rate": 7.035832821477711e-06,
+      "loss": 0.1484094500541687,
+      "step": 4195
+    },
+    {
+      "epoch": 0.7641921397379913,
+      "grad_norm": 0.15140366554260254,
+      "learning_rate": 6.984719432009515e-06,
+      "loss": 0.14991614818572999,
+      "step": 4200
+    },
+    {
+      "epoch": 0.7651018922852983,
+      "grad_norm": 0.16125506162643433,
+      "learning_rate": 6.933762217580289e-06,
+      "loss": 0.1408134937286377,
+      "step": 4205
+    },
+    {
+      "epoch": 0.7660116448326055,
+      "grad_norm": 0.2501450181007385,
+      "learning_rate": 6.882961619939726e-06,
+      "loss": 0.13875640630722047,
+      "step": 4210
+    },
+    {
+      "epoch": 0.7669213973799127,
+      "grad_norm": 0.16227811574935913,
+      "learning_rate": 6.8323180794798245e-06,
+      "loss": 0.14138660430908204,
+      "step": 4215
+    },
+    {
+      "epoch": 0.7678311499272198,
+      "grad_norm": 0.16676810383796692,
+      "learning_rate": 6.781832035231053e-06,
+      "loss": 0.14696706533432008,
+      "step": 4220
+    },
+    {
+      "epoch": 0.768740902474527,
+      "grad_norm": 0.14638574421405792,
+      "learning_rate": 6.731503924858518e-06,
+      "loss": 0.14263020753860473,
+      "step": 4225
+    },
+    {
+      "epoch": 0.769650655021834,
+      "grad_norm": 0.17093190550804138,
+      "learning_rate": 6.681334184658211e-06,
+      "loss": 0.14694111347198485,
+      "step": 4230
+    },
+    {
+      "epoch": 0.7705604075691412,
+      "grad_norm": 0.17174287140369415,
+      "learning_rate": 6.631323249553201e-06,
+      "loss": 0.13854929208755493,
+      "step": 4235
+    },
+    {
+      "epoch": 0.7714701601164483,
+      "grad_norm": 0.14599016308784485,
+      "learning_rate": 6.5814715530898745e-06,
+      "loss": 0.14058833122253417,
+      "step": 4240
+    },
+    {
+      "epoch": 0.7723799126637555,
+      "grad_norm": 0.16222265362739563,
+      "learning_rate": 6.531779527434176e-06,
+      "loss": 0.1428326725959778,
+      "step": 4245
+    },
+    {
+      "epoch": 0.7732896652110626,
+      "grad_norm": 0.1741994023323059,
+      "learning_rate": 6.482247603367839e-06,
+      "loss": 0.13985042572021483,
+      "step": 4250
+    },
+    {
+      "epoch": 0.7741994177583698,
+      "grad_norm": 0.17427101731300354,
+      "learning_rate": 6.432876210284688e-06,
+      "loss": 0.1442667603492737,
+      "step": 4255
+    },
+    {
+      "epoch": 0.7751091703056768,
+      "grad_norm": 0.1665259599685669,
+      "learning_rate": 6.383665776186912e-06,
+      "loss": 0.1421986222267151,
+      "step": 4260
+    },
+    {
+      "epoch": 0.776018922852984,
+      "grad_norm": 0.1728232353925705,
+      "learning_rate": 6.334616727681303e-06,
+      "loss": 0.1367053508758545,
+      "step": 4265
+    },
+    {
+      "epoch": 0.7769286754002911,
+      "grad_norm": 0.15882381796836853,
+      "learning_rate": 6.285729489975639e-06,
+      "loss": 0.14551182985305786,
+      "step": 4270
+    },
+    {
+      "epoch": 0.7778384279475983,
+      "grad_norm": 0.242042675614357,
+      "learning_rate": 6.2370044868749115e-06,
+      "loss": 0.1455132007598877,
+      "step": 4275
+    },
+    {
+      "epoch": 0.7787481804949054,
+      "grad_norm": 0.1599501073360443,
+      "learning_rate": 6.188442140777742e-06,
+      "loss": 0.1424942970275879,
+      "step": 4280
+    },
+    {
+      "epoch": 0.7796579330422125,
+      "grad_norm": 0.15182635188102722,
+      "learning_rate": 6.140042872672647e-06,
+      "loss": 0.14212887287139891,
+      "step": 4285
+    },
+    {
+      "epoch": 0.7805676855895196,
+      "grad_norm": 0.1720375418663025,
+      "learning_rate": 6.091807102134403e-06,
+      "loss": 0.14243412017822266,
+      "step": 4290
+    },
+    {
+      "epoch": 0.7814774381368268,
+      "grad_norm": 0.16436047852039337,
+      "learning_rate": 6.043735247320454e-06,
+      "loss": 0.15035657882690429,
+      "step": 4295
+    },
+    {
+      "epoch": 0.7823871906841339,
+      "grad_norm": 0.1498408019542694,
+      "learning_rate": 5.995827724967218e-06,
+      "loss": 0.14494839906692505,
+      "step": 4300
+    },
+    {
+      "epoch": 0.7832969432314411,
+      "grad_norm": 0.16924560070037842,
+      "learning_rate": 5.948084950386535e-06,
+      "loss": 0.13581212759017944,
+      "step": 4305
+    },
+    {
+      "epoch": 0.7842066957787481,
+      "grad_norm": 0.15889139473438263,
+      "learning_rate": 5.900507337462036e-06,
+      "loss": 0.15071530342102052,
+      "step": 4310
+    },
+    {
+      "epoch": 0.7851164483260553,
+      "grad_norm": 0.17201054096221924,
+      "learning_rate": 5.853095298645542e-06,
+      "loss": 0.1398628830909729,
+      "step": 4315
+    },
+    {
+      "epoch": 0.7860262008733624,
+      "grad_norm": 0.17965619266033173,
+      "learning_rate": 5.805849244953548e-06,
+      "loss": 0.14666696786880493,
+      "step": 4320
+    },
+    {
+      "epoch": 0.7869359534206696,
+      "grad_norm": 0.17514032125473022,
+      "learning_rate": 5.758769585963569e-06,
+      "loss": 0.1383386731147766,
+      "step": 4325
+    },
+    {
+      "epoch": 0.7878457059679768,
+      "grad_norm": 0.17497631907463074,
+      "learning_rate": 5.7118567298106744e-06,
+      "loss": 0.14362354278564454,
+      "step": 4330
+    },
+    {
+      "epoch": 0.7887554585152838,
+      "grad_norm": 0.16770458221435547,
+      "learning_rate": 5.665111083183905e-06,
+      "loss": 0.14136618375778198,
+      "step": 4335
+    },
+    {
+      "epoch": 0.789665211062591,
+      "grad_norm": 0.17134106159210205,
+      "learning_rate": 5.618533051322747e-06,
+      "loss": 0.1401529550552368,
+      "step": 4340
+    },
+    {
+      "epoch": 0.7905749636098981,
+      "grad_norm": 0.19458788633346558,
+      "learning_rate": 5.5721230380136435e-06,
+      "loss": 0.1393273115158081,
+      "step": 4345
+    },
+    {
+      "epoch": 0.7914847161572053,
+      "grad_norm": 0.19483692944049835,
+      "learning_rate": 5.525881445586467e-06,
+      "loss": 0.1369825482368469,
+      "step": 4350
+    },
+    {
+      "epoch": 0.7923944687045124,
+      "grad_norm": 0.3052191734313965,
+      "learning_rate": 5.4798086749110495e-06,
+      "loss": 0.14762181043624878,
+      "step": 4355
+    },
+    {
+      "epoch": 0.7933042212518195,
+      "grad_norm": 0.164458766579628,
+      "learning_rate": 5.4339051253937065e-06,
+      "loss": 0.14501686096191407,
+      "step": 4360
+    },
+    {
+      "epoch": 0.7942139737991266,
+      "grad_norm": 0.1719193458557129,
+      "learning_rate": 5.3881711949737625e-06,
+      "loss": 0.13321092128753662,
+      "step": 4365
+    },
+    {
+      "epoch": 0.7951237263464338,
+      "grad_norm": 0.17219696938991547,
+      "learning_rate": 5.342607280120121e-06,
+      "loss": 0.1413906455039978,
+      "step": 4370
+    },
+    {
+      "epoch": 0.7960334788937409,
+      "grad_norm": 0.15083056688308716,
+      "learning_rate": 5.297213775827789e-06,
+      "loss": 0.14772192239761353,
+      "step": 4375
+    },
+    {
+      "epoch": 0.7969432314410481,
+      "grad_norm": 0.1699071079492569,
+      "learning_rate": 5.251991075614507e-06,
+      "loss": 0.1392375946044922,
+      "step": 4380
+    },
+    {
+      "epoch": 0.7978529839883551,
+      "grad_norm": 0.1680395007133484,
+      "learning_rate": 5.206939571517302e-06,
+      "loss": 0.14185575246810914,
+      "step": 4385
+    },
+    {
+      "epoch": 0.7987627365356623,
+      "grad_norm": 0.16526710987091064,
+      "learning_rate": 5.162059654089083e-06,
+      "loss": 0.15001428127288818,
+      "step": 4390
+    },
+    {
+      "epoch": 0.7996724890829694,
+      "grad_norm": 0.16281752288341522,
+      "learning_rate": 5.1173517123952794e-06,
+      "loss": 0.13747023344039916,
+      "step": 4395
+    },
+    {
+      "epoch": 0.8005822416302766,
+      "grad_norm": 0.1454378366470337,
+      "learning_rate": 5.072816134010458e-06,
+      "loss": 0.14710829257965088,
+      "step": 4400
+    },
+    {
+      "epoch": 0.8014919941775837,
+      "grad_norm": 0.16565890610218048,
+      "learning_rate": 5.028453305014966e-06,
+      "loss": 0.14138611555099487,
+      "step": 4405
+    },
+    {
+      "epoch": 0.8024017467248908,
+      "grad_norm": 0.1962810605764389,
+      "learning_rate": 4.984263609991577e-06,
+      "loss": 0.13836177587509155,
+      "step": 4410
+    },
+    {
+      "epoch": 0.8033114992721979,
+      "grad_norm": 0.16091369092464447,
+      "learning_rate": 4.940247432022149e-06,
+      "loss": 0.14407440423965454,
+      "step": 4415
+    },
+    {
+      "epoch": 0.8042212518195051,
+      "grad_norm": 0.1930241584777832,
+      "learning_rate": 4.89640515268433e-06,
+      "loss": 0.14346336126327514,
+      "step": 4420
+    },
+    {
+      "epoch": 0.8051310043668122,
+      "grad_norm": 0.19301500916481018,
+      "learning_rate": 4.852737152048242e-06,
+      "loss": 0.14174317121505736,
+      "step": 4425
+    },
+    {
+      "epoch": 0.8060407569141194,
+      "grad_norm": 0.1541353315114975,
+      "learning_rate": 4.80924380867315e-06,
+      "loss": 0.14100592136383056,
+      "step": 4430
+    },
+    {
+      "epoch": 0.8069505094614265,
+      "grad_norm": 0.16285750269889832,
+      "learning_rate": 4.765925499604243e-06,
+      "loss": 0.1441288709640503,
+      "step": 4435
+    },
+    {
+      "epoch": 0.8078602620087336,
+      "grad_norm": 0.17382675409317017,
+      "learning_rate": 4.722782600369299e-06,
+      "loss": 0.13763951063156127,
+      "step": 4440
+    },
+    {
+      "epoch": 0.8087700145560408,
+      "grad_norm": 0.1697344034910202,
+      "learning_rate": 4.679815484975505e-06,
+      "loss": 0.1410105347633362,
+      "step": 4445
+    },
+    {
+      "epoch": 0.8096797671033479,
+      "grad_norm": 0.19964542984962463,
+      "learning_rate": 4.637024525906131e-06,
+      "loss": 0.1439276695251465,
+      "step": 4450
+    },
+    {
+      "epoch": 0.8105895196506551,
+      "grad_norm": 0.165307879447937,
+      "learning_rate": 4.59441009411736e-06,
+      "loss": 0.13897504806518554,
+      "step": 4455
+    },
+    {
+      "epoch": 0.8114992721979621,
+      "grad_norm": 0.16687989234924316,
+      "learning_rate": 4.551972559035067e-06,
+      "loss": 0.1422593355178833,
+      "step": 4460
+    },
+    {
+      "epoch": 0.8124090247452693,
+      "grad_norm": 0.15737789869308472,
+      "learning_rate": 4.509712288551571e-06,
+      "loss": 0.1452128052711487,
+      "step": 4465
+    },
+    {
+      "epoch": 0.8133187772925764,
+      "grad_norm": 0.17116659879684448,
+      "learning_rate": 4.467629649022509e-06,
+      "loss": 0.14385371208190917,
+      "step": 4470
+    },
+    {
+      "epoch": 0.8142285298398836,
+      "grad_norm": 0.17457640171051025,
+      "learning_rate": 4.425725005263623e-06,
+      "loss": 0.14808475971221924,
+      "step": 4475
+    },
+    {
+      "epoch": 0.8151382823871907,
+      "grad_norm": 0.1621970385313034,
+      "learning_rate": 4.383998720547583e-06,
+      "loss": 0.13927959203720092,
+      "step": 4480
+    },
+    {
+      "epoch": 0.8160480349344978,
+      "grad_norm": 0.176296666264534,
+      "learning_rate": 4.342451156600896e-06,
+      "loss": 0.15041060447692872,
+      "step": 4485
+    },
+    {
+      "epoch": 0.8169577874818049,
+      "grad_norm": 0.17157645523548126,
+      "learning_rate": 4.301082673600698e-06,
+      "loss": 0.13932652473449708,
+      "step": 4490
+    },
+    {
+      "epoch": 0.8178675400291121,
+      "grad_norm": 0.15378527343273163,
+      "learning_rate": 4.259893630171682e-06,
+      "loss": 0.1406856894493103,
+      "step": 4495
+    },
+    {
+      "epoch": 0.8187772925764192,
+      "grad_norm": 0.1750226765871048,
+      "learning_rate": 4.218884383382987e-06,
+      "loss": 0.1350164532661438,
+      "step": 4500
+    },
+    {
+      "epoch": 0.8196870451237264,
+      "grad_norm": 0.1393742561340332,
+      "learning_rate": 4.178055288745053e-06,
+      "loss": 0.13769235610961914,
+      "step": 4505
+    },
+    {
+      "epoch": 0.8205967976710334,
+      "grad_norm": 0.1668994128704071,
+      "learning_rate": 4.137406700206617e-06,
+      "loss": 0.14029752016067504,
+      "step": 4510
+    },
+    {
+      "epoch": 0.8215065502183406,
+      "grad_norm": 0.1833454668521881,
+      "learning_rate": 4.0969389701515675e-06,
+      "loss": 0.14276301860809326,
+      "step": 4515
+    },
+    {
+      "epoch": 0.8224163027656477,
+      "grad_norm": 0.16187874972820282,
+      "learning_rate": 4.056652449395945e-06,
+      "loss": 0.1444832682609558,
+      "step": 4520
+    },
+    {
+      "epoch": 0.8233260553129549,
+      "grad_norm": 0.1453280746936798,
+      "learning_rate": 4.01654748718488e-06,
+      "loss": 0.14512733221054078,
+      "step": 4525
+    },
+    {
+      "epoch": 0.824235807860262,
+      "grad_norm": 0.1782725751399994,
+      "learning_rate": 3.976624431189563e-06,
+      "loss": 0.14093561172485353,
+      "step": 4530
+    },
+    {
+      "epoch": 0.8251455604075691,
+      "grad_norm": 0.17374491691589355,
+      "learning_rate": 3.936883627504234e-06,
+      "loss": 0.14031401872634888,
+      "step": 4535
+    },
+    {
+      "epoch": 0.8260553129548762,
+      "grad_norm": 0.1609172821044922,
+      "learning_rate": 3.897325420643174e-06,
+      "loss": 0.1428336262702942,
+      "step": 4540
+    },
+    {
+      "epoch": 0.8269650655021834,
+      "grad_norm": 0.1520884931087494,
+      "learning_rate": 3.85795015353774e-06,
+      "loss": 0.1460547924041748,
+      "step": 4545
+    },
+    {
+      "epoch": 0.8278748180494906,
+      "grad_norm": 0.20986326038837433,
+      "learning_rate": 3.818758167533376e-06,
+      "loss": 0.14706350564956666,
+      "step": 4550
+    },
+    {
+      "epoch": 0.8287845705967977,
+      "grad_norm": 0.16825413703918457,
+      "learning_rate": 3.7797498023866396e-06,
+      "loss": 0.14507200717926025,
+      "step": 4555
+    },
+    {
+      "epoch": 0.8296943231441049,
+      "grad_norm": 0.16758380830287933,
+      "learning_rate": 3.740925396262296e-06,
+      "loss": 0.14898381233215333,
+      "step": 4560
+    },
+    {
+      "epoch": 0.8306040756914119,
+      "grad_norm": 0.15207453072071075,
+      "learning_rate": 3.7022852857303503e-06,
+      "loss": 0.14138854742050172,
+      "step": 4565
+    },
+    {
+      "epoch": 0.8315138282387191,
+      "grad_norm": 0.15150749683380127,
+      "learning_rate": 3.66382980576315e-06,
+      "loss": 0.13894975185394287,
+      "step": 4570
+    },
+    {
+      "epoch": 0.8324235807860262,
+      "grad_norm": 0.17071188986301422,
+      "learning_rate": 3.625559289732472e-06,
+      "loss": 0.14072470664978026,
+      "step": 4575
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.154335618019104,
+      "learning_rate": 3.5874740694066294e-06,
+      "loss": 0.13791344165802003,
+      "step": 4580
+    },
+    {
+      "epoch": 0.8342430858806404,
+      "grad_norm": 0.14017128944396973,
+      "learning_rate": 3.5495744749476116e-06,
+      "loss": 0.14427922964096068,
+      "step": 4585
+    },
+    {
+      "epoch": 0.8351528384279476,
+      "grad_norm": 0.17210033535957336,
+      "learning_rate": 3.5118608349081983e-06,
+      "loss": 0.15191166400909423,
+      "step": 4590
+    },
+    {
+      "epoch": 0.8360625909752547,
+      "grad_norm": 0.18715685606002808,
+      "learning_rate": 3.4743334762291358e-06,
+      "loss": 0.14451316595077515,
+      "step": 4595
+    },
+    {
+      "epoch": 0.8369723435225619,
+      "grad_norm": 0.18079884350299835,
+      "learning_rate": 3.436992724236293e-06,
+      "loss": 0.13530746698379517,
+      "step": 4600
+    },
+    {
+      "epoch": 0.837882096069869,
+      "grad_norm": 0.13519920408725739,
+      "learning_rate": 3.399838902637817e-06,
+      "loss": 0.1477964401245117,
+      "step": 4605
+    },
+    {
+      "epoch": 0.8387918486171762,
+      "grad_norm": 0.1778026670217514,
+      "learning_rate": 3.3628723335213885e-06,
+      "loss": 0.14419831037521363,
+      "step": 4610
+    },
+    {
+      "epoch": 0.8397016011644832,
+      "grad_norm": 0.15165366232395172,
+      "learning_rate": 3.326093337351355e-06,
+      "loss": 0.13888469934463502,
+      "step": 4615
+    },
+    {
+      "epoch": 0.8406113537117904,
+      "grad_norm": 0.17049473524093628,
+      "learning_rate": 3.2895022329660018e-06,
+      "loss": 0.14438477754592896,
+      "step": 4620
+    },
+    {
+      "epoch": 0.8415211062590975,
+      "grad_norm": 0.16536414623260498,
+      "learning_rate": 3.2530993375747833e-06,
+      "loss": 0.1444351315498352,
+      "step": 4625
+    },
+    {
+      "epoch": 0.8424308588064047,
+      "grad_norm": 0.17570015788078308,
+      "learning_rate": 3.2168849667555402e-06,
+      "loss": 0.13861945867538453,
+      "step": 4630
+    },
+    {
+      "epoch": 0.8433406113537117,
+      "grad_norm": 0.1699545532464981,
+      "learning_rate": 3.1808594344518132e-06,
+      "loss": 0.13902754783630372,
+      "step": 4635
+    },
+    {
+      "epoch": 0.8442503639010189,
+      "grad_norm": 0.12331254780292511,
+      "learning_rate": 3.1450230529700837e-06,
+      "loss": 0.14104254245758058,
+      "step": 4640
+    },
+    {
+      "epoch": 0.845160116448326,
+      "grad_norm": 0.1508190929889679,
+      "learning_rate": 3.1093761329770708e-06,
+      "loss": 0.13288766145706177,
+      "step": 4645
+    },
+    {
+      "epoch": 0.8460698689956332,
+      "grad_norm": 0.19049489498138428,
+      "learning_rate": 3.0739189834970735e-06,
+      "loss": 0.14914840459823608,
+      "step": 4650
+    },
+    {
+      "epoch": 0.8469796215429404,
+      "grad_norm": 0.1662369966506958,
+      "learning_rate": 3.0386519119092293e-06,
+      "loss": 0.14222898483276367,
+      "step": 4655
+    },
+    {
+      "epoch": 0.8478893740902474,
+      "grad_norm": 0.18985967338085175,
+      "learning_rate": 3.0035752239449126e-06,
+      "loss": 0.14431113004684448,
+      "step": 4660
+    },
+    {
+      "epoch": 0.8487991266375546,
+      "grad_norm": 0.17005261778831482,
+      "learning_rate": 2.9686892236850337e-06,
+      "loss": 0.14140807390213012,
+      "step": 4665
+    },
+    {
+      "epoch": 0.8497088791848617,
+      "grad_norm": 0.16786684095859528,
+      "learning_rate": 2.9339942135574394e-06,
+      "loss": 0.14161460399627684,
+      "step": 4670
+    },
+    {
+      "epoch": 0.8506186317321689,
+      "grad_norm": 0.16358181834220886,
+      "learning_rate": 2.899490494334281e-06,
+      "loss": 0.14674670696258546,
+      "step": 4675
+    },
+    {
+      "epoch": 0.851528384279476,
+      "grad_norm": 0.1651349812746048,
+      "learning_rate": 2.8651783651293867e-06,
+      "loss": 0.13794611692428588,
+      "step": 4680
+    },
+    {
+      "epoch": 0.8524381368267832,
+      "grad_norm": 0.16934923827648163,
+      "learning_rate": 2.831058123395694e-06,
+      "loss": 0.13199397325515747,
+      "step": 4685
+    },
+    {
+      "epoch": 0.8533478893740902,
+      "grad_norm": 0.1704150140285492,
+      "learning_rate": 2.797130064922665e-06,
+      "loss": 0.14044904708862305,
+      "step": 4690
+    },
+    {
+      "epoch": 0.8542576419213974,
+      "grad_norm": 0.1814192682504654,
+      "learning_rate": 2.7633944838337143e-06,
+      "loss": 0.1465100646018982,
+      "step": 4695
+    },
+    {
+      "epoch": 0.8551673944687045,
+      "grad_norm": 0.18942610919475555,
+      "learning_rate": 2.729851672583669e-06,
+      "loss": 0.14685982465744019,
+      "step": 4700
+    },
+    {
+      "epoch": 0.8560771470160117,
+      "grad_norm": 0.17895208299160004,
+      "learning_rate": 2.6965019219562155e-06,
+      "loss": 0.13971571922302245,
+      "step": 4705
+    },
+    {
+      "epoch": 0.8569868995633187,
+      "grad_norm": 0.22735828161239624,
+      "learning_rate": 2.6633455210614055e-06,
+      "loss": 0.13776102066040039,
+      "step": 4710
+    },
+    {
+      "epoch": 0.8578966521106259,
+      "grad_norm": 0.16779793798923492,
+      "learning_rate": 2.630382757333133e-06,
+      "loss": 0.14134042263031005,
+      "step": 4715
+    },
+    {
+      "epoch": 0.858806404657933,
+      "grad_norm": 0.2148888260126114,
+      "learning_rate": 2.597613916526637e-06,
+      "loss": 0.14680721759796142,
+      "step": 4720
+    },
+    {
+      "epoch": 0.8597161572052402,
+      "grad_norm": 0.16560257971286774,
+      "learning_rate": 2.565039282716045e-06,
+      "loss": 0.14137234687805175,
+      "step": 4725
+    },
+    {
+      "epoch": 0.8606259097525473,
+      "grad_norm": 0.16197068989276886,
+      "learning_rate": 2.532659138291879e-06,
+      "loss": 0.14969314336776735,
+      "step": 4730
+    },
+    {
+      "epoch": 0.8615356622998545,
+      "grad_norm": 0.14650246500968933,
+      "learning_rate": 2.5004737639586497e-06,
+      "loss": 0.13532910346984864,
+      "step": 4735
+    },
+    {
+      "epoch": 0.8624454148471615,
+      "grad_norm": 0.1565634310245514,
+      "learning_rate": 2.4684834387323943e-06,
+      "loss": 0.14146244525909424,
+      "step": 4740
+    },
+    {
+      "epoch": 0.8633551673944687,
+      "grad_norm": 0.18060864508152008,
+      "learning_rate": 2.4366884399382393e-06,
+      "loss": 0.14218534231185914,
+      "step": 4745
+    },
+    {
+      "epoch": 0.8642649199417758,
+      "grad_norm": 0.24613255262374878,
+      "learning_rate": 2.4050890432080557e-06,
+      "loss": 0.13907679319381713,
+      "step": 4750
+    },
+    {
+      "epoch": 0.865174672489083,
+      "grad_norm": 0.16036023199558258,
+      "learning_rate": 2.3736855224780057e-06,
+      "loss": 0.13718113899230958,
+      "step": 4755
+    },
+    {
+      "epoch": 0.86608442503639,
+      "grad_norm": 0.16678516566753387,
+      "learning_rate": 2.3424781499862075e-06,
+      "loss": 0.1327962040901184,
+      "step": 4760
+    },
+    {
+      "epoch": 0.8669941775836972,
+      "grad_norm": 0.1763770878314972,
+      "learning_rate": 2.3114671962703727e-06,
+      "loss": 0.14390318393707274,
+      "step": 4765
+    },
+    {
+      "epoch": 0.8679039301310044,
+      "grad_norm": 0.17735697329044342,
+      "learning_rate": 2.280652930165428e-06,
+      "loss": 0.15223288536071777,
+      "step": 4770
+    },
+    {
+      "epoch": 0.8688136826783115,
+      "grad_norm": 0.15827041864395142,
+      "learning_rate": 2.250035618801241e-06,
+      "loss": 0.14296332597732545,
+      "step": 4775
+    },
+    {
+      "epoch": 0.8697234352256187,
+      "grad_norm": 0.16876135766506195,
+      "learning_rate": 2.219615527600244e-06,
+      "loss": 0.1359076738357544,
+      "step": 4780
+    },
+    {
+      "epoch": 0.8706331877729258,
+      "grad_norm": 0.1800110638141632,
+      "learning_rate": 2.189392920275174e-06,
+      "loss": 0.1424281358718872,
+      "step": 4785
+    },
+    {
+      "epoch": 0.8715429403202329,
+      "grad_norm": 0.1409560889005661,
+      "learning_rate": 2.159368058826783e-06,
+      "loss": 0.14480490684509278,
+      "step": 4790
+    },
+    {
+      "epoch": 0.87245269286754,
+      "grad_norm": 0.1634288728237152,
+      "learning_rate": 2.129541203541535e-06,
+      "loss": 0.14513269662857056,
+      "step": 4795
+    },
+    {
+      "epoch": 0.8733624454148472,
+      "grad_norm": 0.17126062512397766,
+      "learning_rate": 2.099912612989391e-06,
+      "loss": 0.13546934127807617,
+      "step": 4800
+    },
+    {
+      "epoch": 0.8742721979621543,
+      "grad_norm": 0.16704080998897552,
+      "learning_rate": 2.0704825440215457e-06,
+      "loss": 0.13852492570877076,
+      "step": 4805
+    },
+    {
+      "epoch": 0.8751819505094615,
+      "grad_norm": 0.1725970208644867,
+      "learning_rate": 2.0412512517681946e-06,
+      "loss": 0.14504197835922242,
+      "step": 4810
+    },
+    {
+      "epoch": 0.8760917030567685,
+      "grad_norm": 0.1700201779603958,
+      "learning_rate": 2.0122189896363387e-06,
+      "loss": 0.14312338829040527,
+      "step": 4815
+    },
+    {
+      "epoch": 0.8770014556040757,
+      "grad_norm": 0.16491736471652985,
+      "learning_rate": 1.9833860093075834e-06,
+      "loss": 0.14062976837158203,
+      "step": 4820
+    },
+    {
+      "epoch": 0.8779112081513828,
+      "grad_norm": 0.13748787343502045,
+      "learning_rate": 1.9547525607359537e-06,
+      "loss": 0.1346171498298645,
+      "step": 4825
+    },
+    {
+      "epoch": 0.87882096069869,
+      "grad_norm": 0.16399399936199188,
+      "learning_rate": 1.926318892145712e-06,
+      "loss": 0.14178123474121093,
+      "step": 4830
+    },
+    {
+      "epoch": 0.879730713245997,
+      "grad_norm": 0.14491963386535645,
+      "learning_rate": 1.8980852500292412e-06,
+      "loss": 0.1408564567565918,
+      "step": 4835
+    },
+    {
+      "epoch": 0.8806404657933042,
+      "grad_norm": 0.17335423827171326,
+      "learning_rate": 1.8700518791448851e-06,
+      "loss": 0.14403265714645386,
+      "step": 4840
+    },
+    {
+      "epoch": 0.8815502183406113,
+      "grad_norm": 0.17399625480175018,
+      "learning_rate": 1.8422190225148155e-06,
+      "loss": 0.14289036989212037,
+      "step": 4845
+    },
+    {
+      "epoch": 0.8824599708879185,
+      "grad_norm": 0.17945612967014313,
+      "learning_rate": 1.814586921422956e-06,
+      "loss": 0.14494109153747559,
+      "step": 4850
+    },
+    {
+      "epoch": 0.8833697234352256,
+      "grad_norm": 0.1910620480775833,
+      "learning_rate": 1.7871558154128664e-06,
+      "loss": 0.13726245164871215,
+      "step": 4855
+    },
+    {
+      "epoch": 0.8842794759825328,
+      "grad_norm": 0.1771879345178604,
+      "learning_rate": 1.7599259422856756e-06,
+      "loss": 0.1464752197265625,
+      "step": 4860
+    },
+    {
+      "epoch": 0.8851892285298398,
+      "grad_norm": 0.19427461922168732,
+      "learning_rate": 1.7328975380980218e-06,
+      "loss": 0.13823356628417968,
+      "step": 4865
+    },
+    {
+      "epoch": 0.886098981077147,
+      "grad_norm": 0.1491149365901947,
+      "learning_rate": 1.7060708371599897e-06,
+      "loss": 0.1338604211807251,
+      "step": 4870
+    },
+    {
+      "epoch": 0.8870087336244541,
+      "grad_norm": 0.16087733209133148,
+      "learning_rate": 1.6794460720331057e-06,
+      "loss": 0.14184389114379883,
+      "step": 4875
+    },
+    {
+      "epoch": 0.8879184861717613,
+      "grad_norm": 0.14506325125694275,
+      "learning_rate": 1.653023473528309e-06,
+      "loss": 0.14267687797546386,
+      "step": 4880
+    },
+    {
+      "epoch": 0.8888282387190685,
+      "grad_norm": 0.16886365413665771,
+      "learning_rate": 1.626803270703936e-06,
+      "loss": 0.14266083240509034,
+      "step": 4885
+    },
+    {
+      "epoch": 0.8897379912663755,
+      "grad_norm": 0.1891999989748001,
+      "learning_rate": 1.6007856908637652e-06,
+      "loss": 0.1398016929626465,
+      "step": 4890
+    },
+    {
+      "epoch": 0.8906477438136827,
+      "grad_norm": 0.17645299434661865,
+      "learning_rate": 1.5749709595550083e-06,
+      "loss": 0.13869571685791016,
+      "step": 4895
+    },
+    {
+      "epoch": 0.8915574963609898,
+      "grad_norm": 0.17714262008666992,
+      "learning_rate": 1.549359300566408e-06,
+      "loss": 0.14957486391067504,
+      "step": 4900
+    },
+    {
+      "epoch": 0.892467248908297,
+      "grad_norm": 0.18025240302085876,
+      "learning_rate": 1.5239509359262355e-06,
+      "loss": 0.1358652949333191,
+      "step": 4905
+    },
+    {
+      "epoch": 0.8933770014556041,
+      "grad_norm": 0.17539937794208527,
+      "learning_rate": 1.4987460859004154e-06,
+      "loss": 0.13833394050598144,
+      "step": 4910
+    },
+    {
+      "epoch": 0.8942867540029112,
+      "grad_norm": 0.1772230565547943,
+      "learning_rate": 1.4737449689905953e-06,
+      "loss": 0.14202116727828978,
+      "step": 4915
+    },
+    {
+      "epoch": 0.8951965065502183,
+      "grad_norm": 0.1670161783695221,
+      "learning_rate": 1.4489478019322433e-06,
+      "loss": 0.1403665542602539,
+      "step": 4920
+    },
+    {
+      "epoch": 0.8961062590975255,
+      "grad_norm": 0.1697034239768982,
+      "learning_rate": 1.4243547996927926e-06,
+      "loss": 0.1401481032371521,
+      "step": 4925
+    },
+    {
+      "epoch": 0.8970160116448326,
+      "grad_norm": 0.16474860906600952,
+      "learning_rate": 1.3999661754697636e-06,
+      "loss": 0.13969850540161133,
+      "step": 4930
+    },
+    {
+      "epoch": 0.8979257641921398,
+      "grad_norm": 0.1664883941411972,
+      "learning_rate": 1.3757821406889027e-06,
+      "loss": 0.1399069309234619,
+      "step": 4935
+    },
+    {
+      "epoch": 0.8988355167394468,
+      "grad_norm": 0.16675794124603271,
+      "learning_rate": 1.351802905002386e-06,
+      "loss": 0.14129226207733153,
+      "step": 4940
+    },
+    {
+      "epoch": 0.899745269286754,
+      "grad_norm": 0.17529809474945068,
+      "learning_rate": 1.3280286762869632e-06,
+      "loss": 0.14663081169128417,
+      "step": 4945
+    },
+    {
+      "epoch": 0.9006550218340611,
+      "grad_norm": 0.17758169770240784,
+      "learning_rate": 1.3044596606421795e-06,
+      "loss": 0.13986254930496217,
+      "step": 4950
+    },
+    {
+      "epoch": 0.9015647743813683,
+      "grad_norm": 0.153225839138031,
+      "learning_rate": 1.2810960623885815e-06,
+      "loss": 0.14236698150634766,
+      "step": 4955
+    },
+    {
+      "epoch": 0.9024745269286754,
+      "grad_norm": 0.169761523604393,
+      "learning_rate": 1.2579380840659376e-06,
+      "loss": 0.1450445055961609,
+      "step": 4960
+    },
+    {
+      "epoch": 0.9033842794759825,
+      "grad_norm": 0.16659331321716309,
+      "learning_rate": 1.2349859264315034e-06,
+      "loss": 0.14043926000595092,
+      "step": 4965
+    },
+    {
+      "epoch": 0.9042940320232896,
+      "grad_norm": 0.16748706996440887,
+      "learning_rate": 1.2122397884582553e-06,
+      "loss": 0.14725675582885742,
+      "step": 4970
+    },
+    {
+      "epoch": 0.9052037845705968,
+      "grad_norm": 0.1600511223077774,
+      "learning_rate": 1.1896998673331883e-06,
+      "loss": 0.14551150798797607,
+      "step": 4975
+    },
+    {
+      "epoch": 0.9061135371179039,
+      "grad_norm": 0.24318362772464752,
+      "learning_rate": 1.1673663584555934e-06,
+      "loss": 0.14470888376235963,
+      "step": 4980
+    },
+    {
+      "epoch": 0.9070232896652111,
+      "grad_norm": 0.16443821787834167,
+      "learning_rate": 1.1452394554353706e-06,
+      "loss": 0.13639854192733764,
+      "step": 4985
+    },
+    {
+      "epoch": 0.9079330422125182,
+      "grad_norm": 0.14277774095535278,
+      "learning_rate": 1.1233193500913453e-06,
+      "loss": 0.13749881982803344,
+      "step": 4990
+    },
+    {
+      "epoch": 0.9088427947598253,
+      "grad_norm": 0.1610947549343109,
+      "learning_rate": 1.1016062324496008e-06,
+      "loss": 0.1385629653930664,
+      "step": 4995
+    },
+    {
+      "epoch": 0.9097525473071325,
+      "grad_norm": 0.17888498306274414,
+      "learning_rate": 1.080100290741845e-06,
+      "loss": 0.14225621223449708,
+      "step": 5000
+    },
+    {
+      "epoch": 0.9106622998544396,
+      "grad_norm": 0.17488449811935425,
+      "learning_rate": 1.0588017114037729e-06,
+      "loss": 0.14187805652618407,
+      "step": 5005
+    },
+    {
+      "epoch": 0.9115720524017468,
+      "grad_norm": 0.16410665214061737,
+      "learning_rate": 1.0377106790734392e-06,
+      "loss": 0.1407416582107544,
+      "step": 5010
+    },
+    {
+      "epoch": 0.9124818049490538,
+      "grad_norm": 0.18115971982479095,
+      "learning_rate": 1.016827376589674e-06,
+      "loss": 0.1427263855934143,
+      "step": 5015
+    },
+    {
+      "epoch": 0.913391557496361,
+      "grad_norm": 0.18507841229438782,
+      "learning_rate": 9.961519849904898e-07,
+      "loss": 0.1390499472618103,
+      "step": 5020
+    },
+    {
+      "epoch": 0.9143013100436681,
+      "grad_norm": 0.21296796202659607,
+      "learning_rate": 9.75684683511513e-07,
+      "loss": 0.1382216691970825,
+      "step": 5025
+    },
+    {
+      "epoch": 0.9152110625909753,
+      "grad_norm": 0.2308044582605362,
+      "learning_rate": 9.55425649584435e-07,
+      "loss": 0.14271280765533448,
+      "step": 5030
+    },
+    {
+      "epoch": 0.9161208151382824,
+      "grad_norm": 0.15796682238578796,
+      "learning_rate": 9.353750588354527e-07,
+      "loss": 0.13807624578475952,
+      "step": 5035
+    },
+    {
+      "epoch": 0.9170305676855895,
+      "grad_norm": 0.1695316582918167,
+      "learning_rate": 9.155330850837834e-07,
+      "loss": 0.14289476871490478,
+      "step": 5040
+    },
+    {
+      "epoch": 0.9179403202328966,
+      "grad_norm": 0.1738404780626297,
+      "learning_rate": 8.958999003401191e-07,
+      "loss": 0.14070619344711305,
+      "step": 5045
+    },
+    {
+      "epoch": 0.9188500727802038,
+      "grad_norm": 0.20618964731693268,
+      "learning_rate": 8.764756748051662e-07,
+      "loss": 0.14535053968429565,
+      "step": 5050
+    },
+    {
+      "epoch": 0.9197598253275109,
+      "grad_norm": 0.1506137251853943,
+      "learning_rate": 8.572605768681546e-07,
+      "loss": 0.13995139598846434,
+      "step": 5055
+    },
+    {
+      "epoch": 0.9206695778748181,
+      "grad_norm": 0.17772039771080017,
+      "learning_rate": 8.382547731053708e-07,
+      "loss": 0.14470311403274536,
+      "step": 5060
+    },
+    {
+      "epoch": 0.9215793304221251,
+      "grad_norm": 0.19897456467151642,
+      "learning_rate": 8.194584282787382e-07,
+      "loss": 0.144488525390625,
+      "step": 5065
+    },
+    {
+      "epoch": 0.9224890829694323,
+      "grad_norm": 0.15899236500263214,
+      "learning_rate": 8.008717053343606e-07,
+      "loss": 0.1352991580963135,
+      "step": 5070
+    },
+    {
+      "epoch": 0.9233988355167394,
+      "grad_norm": 0.14965768158435822,
+      "learning_rate": 7.824947654011345e-07,
+      "loss": 0.13827911615371705,
+      "step": 5075
+    },
+    {
+      "epoch": 0.9243085880640466,
+      "grad_norm": 0.43651485443115234,
+      "learning_rate": 7.643277677893329e-07,
+      "loss": 0.14149526357650757,
+      "step": 5080
+    },
+    {
+      "epoch": 0.9252183406113537,
+      "grad_norm": 0.19912713766098022,
+      "learning_rate": 7.463708699892325e-07,
+      "loss": 0.14357032775878906,
+      "step": 5085
+    },
+    {
+      "epoch": 0.9261280931586608,
+      "grad_norm": 0.1635904610157013,
+      "learning_rate": 7.286242276697524e-07,
+      "loss": 0.13550699949264527,
+      "step": 5090
+    },
+    {
+      "epoch": 0.9270378457059679,
+      "grad_norm": 0.19391080737113953,
+      "learning_rate": 7.11087994677101e-07,
+      "loss": 0.14674756526947022,
+      "step": 5095
+    },
+    {
+      "epoch": 0.9279475982532751,
+      "grad_norm": 0.17458125948905945,
+      "learning_rate": 6.937623230334284e-07,
+      "loss": 0.14155579805374147,
+      "step": 5100
+    },
+    {
+      "epoch": 0.9288573508005823,
+      "grad_norm": 0.1617971807718277,
+      "learning_rate": 6.766473629355452e-07,
+      "loss": 0.140555477142334,
+      "step": 5105
+    },
+    {
+      "epoch": 0.9297671033478894,
+      "grad_norm": 0.16945427656173706,
+      "learning_rate": 6.59743262753576e-07,
+      "loss": 0.13607511520385743,
+      "step": 5110
+    },
+    {
+      "epoch": 0.9306768558951966,
+      "grad_norm": 0.18347840011119843,
+      "learning_rate": 6.43050169029702e-07,
+      "loss": 0.14903461933135986,
+      "step": 5115
+    },
+    {
+      "epoch": 0.9315866084425036,
+      "grad_norm": 0.15434837341308594,
+      "learning_rate": 6.265682264768869e-07,
+      "loss": 0.14146015644073487,
+      "step": 5120
+    },
+    {
+      "epoch": 0.9324963609898108,
+      "grad_norm": 0.1397712528705597,
+      "learning_rate": 6.10297577977606e-07,
+      "loss": 0.14261592626571656,
+      "step": 5125
+    },
+    {
+      "epoch": 0.9334061135371179,
+      "grad_norm": 0.1765873283147812,
+      "learning_rate": 5.942383645826361e-07,
+      "loss": 0.13559447526931762,
+      "step": 5130
+    },
+    {
+      "epoch": 0.9343158660844251,
+      "grad_norm": 0.1656057983636856,
+      "learning_rate": 5.783907255098003e-07,
+      "loss": 0.13961490392684936,
+      "step": 5135
+    },
+    {
+      "epoch": 0.9352256186317321,
+      "grad_norm": 0.2169366180896759,
+      "learning_rate": 5.627547981427894e-07,
+      "loss": 0.1447835922241211,
+      "step": 5140
+    },
+    {
+      "epoch": 0.9361353711790393,
+      "grad_norm": 0.18623125553131104,
+      "learning_rate": 5.473307180299508e-07,
+      "loss": 0.14366730451583862,
+      "step": 5145
+    },
+    {
+      "epoch": 0.9370451237263464,
+      "grad_norm": 0.15423963963985443,
+      "learning_rate": 5.32118618883129e-07,
+      "loss": 0.14295632839202882,
+      "step": 5150
+    },
+    {
+      "epoch": 0.9379548762736536,
+      "grad_norm": 0.18423247337341309,
+      "learning_rate": 5.17118632576491e-07,
+      "loss": 0.14137414693832398,
+      "step": 5155
+    },
+    {
+      "epoch": 0.9388646288209607,
+      "grad_norm": 0.15338757634162903,
+      "learning_rate": 5.023308891453915e-07,
+      "loss": 0.13583066463470458,
+      "step": 5160
+    },
+    {
+      "epoch": 0.9397743813682679,
+      "grad_norm": 0.2293633222579956,
+      "learning_rate": 4.877555167852515e-07,
+      "loss": 0.14819620847702025,
+      "step": 5165
+    },
+    {
+      "epoch": 0.9406841339155749,
+      "grad_norm": 0.16889944672584534,
+      "learning_rate": 4.7339264185043974e-07,
+      "loss": 0.13617686033248902,
+      "step": 5170
+    },
+    {
+      "epoch": 0.9415938864628821,
+      "grad_norm": 0.1767464578151703,
+      "learning_rate": 4.5924238885316775e-07,
+      "loss": 0.13487552404403685,
+      "step": 5175
+    },
+    {
+      "epoch": 0.9425036390101892,
+      "grad_norm": 0.16697899997234344,
+      "learning_rate": 4.453048804624327e-07,
+      "loss": 0.1446886420249939,
+      "step": 5180
+    },
+    {
+      "epoch": 0.9434133915574964,
+      "grad_norm": 0.19576266407966614,
+      "learning_rate": 4.315802375029293e-07,
+      "loss": 0.14252450466156005,
+      "step": 5185
+    },
+    {
+      "epoch": 0.9443231441048034,
+      "grad_norm": 0.14838077127933502,
+      "learning_rate": 4.18068578954034e-07,
+      "loss": 0.13933032751083374,
+      "step": 5190
+    },
+    {
+      "epoch": 0.9452328966521106,
+      "grad_norm": 0.18481744825839996,
+      "learning_rate": 4.047700219487388e-07,
+      "loss": 0.1410665273666382,
+      "step": 5195
+    },
+    {
+      "epoch": 0.9461426491994177,
+      "grad_norm": 0.16954176127910614,
+      "learning_rate": 3.9168468177265547e-07,
+      "loss": 0.1421758770942688,
+      "step": 5200
+    },
+    {
+      "epoch": 0.9470524017467249,
+      "grad_norm": 0.17614421248435974,
+      "learning_rate": 3.7881267186301306e-07,
+      "loss": 0.14059911966323851,
+      "step": 5205
+    },
+    {
+      "epoch": 0.9479621542940321,
+      "grad_norm": 0.1637226939201355,
+      "learning_rate": 3.6615410380767544e-07,
+      "loss": 0.1360395908355713,
+      "step": 5210
+    },
+    {
+      "epoch": 0.9488719068413392,
+      "grad_norm": 0.18330250680446625,
+      "learning_rate": 3.5370908734417006e-07,
+      "loss": 0.14543824195861815,
+      "step": 5215
+    },
+    {
+      "epoch": 0.9497816593886463,
+      "grad_norm": 0.1895420402288437,
+      "learning_rate": 3.414777303587413e-07,
+      "loss": 0.15304578542709352,
+      "step": 5220
+    },
+    {
+      "epoch": 0.9506914119359534,
+      "grad_norm": 0.15384933352470398,
+      "learning_rate": 3.294601388854041e-07,
+      "loss": 0.14675912857055665,
+      "step": 5225
+    },
+    {
+      "epoch": 0.9516011644832606,
+      "grad_norm": 0.20188499987125397,
+      "learning_rate": 3.1765641710505e-07,
+      "loss": 0.14068362712860108,
+      "step": 5230
+    },
+    {
+      "epoch": 0.9525109170305677,
+      "grad_norm": 0.16467279195785522,
+      "learning_rate": 3.060666673445123e-07,
+      "loss": 0.14733167886734008,
+      "step": 5235
+    },
+    {
+      "epoch": 0.9534206695778749,
+      "grad_norm": 0.16632016003131866,
+      "learning_rate": 2.9469099007569943e-07,
+      "loss": 0.13753929138183593,
+      "step": 5240
+    },
+    {
+      "epoch": 0.9543304221251819,
+      "grad_norm": 0.1477566957473755,
+      "learning_rate": 2.83529483914724e-07,
+      "loss": 0.14354891777038575,
+      "step": 5245
+    },
+    {
+      "epoch": 0.9552401746724891,
+      "grad_norm": 0.1693645417690277,
+      "learning_rate": 2.7258224562102805e-07,
+      "loss": 0.14622807502746582,
+      "step": 5250
+    },
+    {
+      "epoch": 0.9561499272197962,
+      "grad_norm": 0.17574062943458557,
+      "learning_rate": 2.6184937009657295e-07,
+      "loss": 0.1344899296760559,
+      "step": 5255
+    },
+    {
+      "epoch": 0.9570596797671034,
+      "grad_norm": 0.17448563873767853,
+      "learning_rate": 2.513309503850009e-07,
+      "loss": 0.1355789542198181,
+      "step": 5260
+    },
+    {
+      "epoch": 0.9579694323144105,
+      "grad_norm": 0.16993778944015503,
+      "learning_rate": 2.41027077670819e-07,
+      "loss": 0.151595401763916,
+      "step": 5265
+    },
+    {
+      "epoch": 0.9588791848617176,
+      "grad_norm": 0.16944102942943573,
+      "learning_rate": 2.3093784127863062e-07,
+      "loss": 0.1466623306274414,
+      "step": 5270
+    },
+    {
+      "epoch": 0.9597889374090247,
+      "grad_norm": 0.18085163831710815,
+      "learning_rate": 2.2106332867234402e-07,
+      "loss": 0.14645814895629883,
+      "step": 5275
+    },
+    {
+      "epoch": 0.9606986899563319,
+      "grad_norm": 0.14682307839393616,
+      "learning_rate": 2.1140362545442605e-07,
+      "loss": 0.13901774883270263,
+      "step": 5280
+    },
+    {
+      "epoch": 0.961608442503639,
+      "grad_norm": 0.17189526557922363,
+      "learning_rate": 2.0195881536514694e-07,
+      "loss": 0.14153491258621215,
+      "step": 5285
+    },
+    {
+      "epoch": 0.9625181950509462,
+      "grad_norm": 0.1977207362651825,
+      "learning_rate": 1.9272898028186714e-07,
+      "loss": 0.1437437653541565,
+      "step": 5290
+    },
+    {
+      "epoch": 0.9634279475982532,
+      "grad_norm": 0.16637668013572693,
+      "learning_rate": 1.837142002183184e-07,
+      "loss": 0.13910138607025146,
+      "step": 5295
+    },
+    {
+      "epoch": 0.9643377001455604,
+      "grad_norm": 0.18155774474143982,
+      "learning_rate": 1.7491455332391548e-07,
+      "loss": 0.14177814722061158,
+      "step": 5300
+    },
+    {
+      "epoch": 0.9652474526928675,
+      "grad_norm": 0.32478174567222595,
+      "learning_rate": 1.6633011588307878e-07,
+      "loss": 0.14292703866958617,
+      "step": 5305
+    },
+    {
+      "epoch": 0.9661572052401747,
+      "grad_norm": 0.18050940334796906,
+      "learning_rate": 1.5796096231456558e-07,
+      "loss": 0.13252723217010498,
+      "step": 5310
+    },
+    {
+      "epoch": 0.9670669577874818,
+      "grad_norm": 0.15919657051563263,
+      "learning_rate": 1.4980716517083715e-07,
+      "loss": 0.14491976499557496,
+      "step": 5315
+    },
+    {
+      "epoch": 0.9679767103347889,
+      "grad_norm": 0.15895310044288635,
+      "learning_rate": 1.4186879513741758e-07,
+      "loss": 0.13617006540298462,
+      "step": 5320
+    },
+    {
+      "epoch": 0.9688864628820961,
+      "grad_norm": 0.1543736606836319,
+      "learning_rate": 1.3414592103228595e-07,
+      "loss": 0.14220429658889772,
+      "step": 5325
+    },
+    {
+      "epoch": 0.9697962154294032,
+      "grad_norm": 0.16660647094249725,
+      "learning_rate": 1.2663860980528797e-07,
+      "loss": 0.14069980382919312,
+      "step": 5330
+    },
+    {
+      "epoch": 0.9707059679767104,
+      "grad_norm": 0.15238550305366516,
+      "learning_rate": 1.1934692653754186e-07,
+      "loss": 0.13978019952774048,
+      "step": 5335
+    },
+    {
+      "epoch": 0.9716157205240175,
+      "grad_norm": 0.1649473011493683,
+      "learning_rate": 1.1227093444088066e-07,
+      "loss": 0.1401435136795044,
+      "step": 5340
+    },
+    {
+      "epoch": 0.9725254730713246,
+      "grad_norm": 0.14920124411582947,
+      "learning_rate": 1.0541069485730249e-07,
+      "loss": 0.13952178955078126,
+      "step": 5345
+    },
+    {
+      "epoch": 0.9734352256186317,
+      "grad_norm": 0.16802479326725006,
+      "learning_rate": 9.876626725844329e-08,
+      "loss": 0.14808181524276734,
+      "step": 5350
+    },
+    {
+      "epoch": 0.9743449781659389,
+      "grad_norm": 0.18096603453159332,
+      "learning_rate": 9.233770924505781e-08,
+      "loss": 0.13938647508621216,
+      "step": 5355
+    },
+    {
+      "epoch": 0.975254730713246,
+      "grad_norm": 0.1658579558134079,
+      "learning_rate": 8.612507654651991e-08,
+      "loss": 0.14219754934310913,
+      "step": 5360
+    },
+    {
+      "epoch": 0.9761644832605532,
+      "grad_norm": 0.1547713279724121,
+      "learning_rate": 8.012842302033696e-08,
+      "loss": 0.14298388957977295,
+      "step": 5365
+    },
+    {
+      "epoch": 0.9770742358078602,
+      "grad_norm": 0.18247587978839874,
+      "learning_rate": 7.434780065169178e-08,
+      "loss": 0.14103788137435913,
+      "step": 5370
+    },
+    {
+      "epoch": 0.9779839883551674,
+      "grad_norm": 0.17593605816364288,
+      "learning_rate": 6.878325955297915e-08,
+      "loss": 0.1450013041496277,
+      "step": 5375
+    },
+    {
+      "epoch": 0.9788937409024745,
+      "grad_norm": 0.17178039252758026,
+      "learning_rate": 6.343484796338395e-08,
+      "loss": 0.14021269083023072,
+      "step": 5380
+    },
+    {
+      "epoch": 0.9798034934497817,
+      "grad_norm": 0.17904147505760193,
+      "learning_rate": 5.830261224845923e-08,
+      "loss": 0.1460060477256775,
+      "step": 5385
+    },
+    {
+      "epoch": 0.9807132459970888,
+      "grad_norm": 0.16323266923427582,
+      "learning_rate": 5.338659689971548e-08,
+      "loss": 0.13915741443634033,
+      "step": 5390
+    },
+    {
+      "epoch": 0.9816229985443959,
+      "grad_norm": 0.1829039305448532,
+      "learning_rate": 4.8686844534248655e-08,
+      "loss": 0.1372266888618469,
+      "step": 5395
+    },
+    {
+      "epoch": 0.982532751091703,
+      "grad_norm": 0.16742415726184845,
+      "learning_rate": 4.420339589435995e-08,
+      "loss": 0.14404670000076295,
+      "step": 5400
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.967782995976911e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-5400/training_args.bin b/checkpoint-5400/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-5400/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-5500/README.md b/checkpoint-5500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-5500/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-5500/adapter_config.json b/checkpoint-5500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-5500/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-5500/adapter_model.safetensors b/checkpoint-5500/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..13e7eccf51f5d9d11e1fc349773e81db85eac36b
--- /dev/null
+++ b/checkpoint-5500/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:023fcb9c596c99c5e8d74320f9720621834918ec3bcd5d877b44b0fe0907ce2e
+size 169741912
diff --git a/checkpoint-5500/chat_template.jinja b/checkpoint-5500/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-5500/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%} {#- Renders every entry of properties, sorted by key, as a comma-separated key:{ ... } group ending with its upper-cased type; recurses into array item properties and object properties. The required argument is accepted but never read in this macro. With filter_keys=true, keys listed in standard_keys are skipped. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} {#- schema keywords that are skipped when filter_keys is true -#}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%} {#- becomes true once a field has been written inside this key's braces, so later fields are preceded by a comma -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%} {#- arrays: render the items schema entry by entry, skipping entries whose value is none -#}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%} {#- no properties mapping on the object: the non-standard keys of value itself are rendered as its properties (filter_keys=true) -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%} {#- Renders one tool definition: declaration, function name, then a braced group holding the description, the optional parameters group (properties, required, type) and the optional response group (description, type). tool_data is read through tool_data['function']. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>
+        {%- endif -%}{{- '}' -}} {#- close the parameters group even when the schema has no top-level type; previously the brace was only written together with the type, leaving the declaration unbalanced -#}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>
+        {%- endif -%}{{- '}' -}} {#- close the response group for every response type; the type field itself is still only written for OBJECT -#}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%} {#- Serialises one value: strings are wrapped in the string-delimiter token, booleans become true or false, mappings become a braced key:value list sorted by key, other sequences become a bracketed list, and anything else is printed as-is. escape_keys controls whether mapping keys are wrapped in the delimiter too; it is passed down unchanged on recursion. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%} {#- reached only for values that are neither string nor mapping, since those are handled by the earlier branches -#}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%} {#- Returns text with each span from a channel-open marker up to the next channel-close marker removed (to the end of text when the span is never closed); stray close markers are dropped as well, and the result is trimmed. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%} {#- keep only what precedes the first open marker in this piece -#}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%} {#- Emits one tool response wrapped in the tool_response open and close tokens, as response:NAME followed by a braced body. A mapping response is written as key:value pairs sorted by key; any other response is written as a single value: entry. Values go through format_argument with escape_keys=False. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%} {#- ns.prev_message_type records the kind of the most recently rendered element; it is read at the end of the template to decide whether to open a model turn -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- System turn: emitted when thinking is enabled, tools are supplied, or the first message has role system or developer -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%} {#- content given as a list of parts: each part's text is trimmed and followed by a single space -#}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%} {#- the leading system/developer message has been rendered here, so the message loop starts after it -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find the index (within loop_messages) of the last user message; it stays -1 when there is none. The message loop compares against it before rendering reasoning. -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages. Messages with role tool are not rendered on their own turn; they are emitted by the forward scan under the assistant message whose tool_calls precede them. -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel, only for messages after the last user message that also carry tool_calls -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%} {#- arguments supplied as a string are inserted verbatim between the braces -#}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%} {#- flag becomes true once at least one tool response has been written for this message -#}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native); a missing, None or empty name falls back to 'unknown' -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown', true), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name; otherwise use the tool message's own name, or 'unknown' when that is missing (get returns None, which the plain default filter would let through) -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array; text parts are concatenated and a part without a text value contributes an empty string -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} {#- tool calls were written but no tool response followed: open a tool response instead of closing the turn -#}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%} {#- close the turn, except when tool responses were written and the message has no text content -#}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%} {#- open a model turn for generation unless ns.prev_message_type was left at tool_response or tool_call by the last rendered non-tool message -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-5500/optimizer.pt b/checkpoint-5500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b3c813f7ee054bd6cc17032b68e0ee8e23f652ff
--- /dev/null
+++ b/checkpoint-5500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a943077e29417c2f64c4e35a6044a31f0885d613fe3e295a70b06474feaca5da
+size 72807355
diff --git a/checkpoint-5500/processor_config.json b/checkpoint-5500/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-5500/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-5500/rng_state.pth b/checkpoint-5500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..bd3c190a62ba12f21428f29e0f4bde711034a75b
--- /dev/null
+++ b/checkpoint-5500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4a9f217e852f439efa6bd32fde98d6867f11aa6ea13ddc021ba10af6a0b0934
+size 14645
diff --git a/checkpoint-5500/scheduler.pt b/checkpoint-5500/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..110cd65327bf1c41e27ad865b4a927c404da750c
--- /dev/null
+++ b/checkpoint-5500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bbfcd72a0f469e954d7ef4acd4596e1e654cea8ecfeeb0bc5b3be32d628eac2c
+size 1465
diff --git a/checkpoint-5500/tokenizer.json b/checkpoint-5500/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-5500/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-5500/tokenizer_config.json b/checkpoint-5500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-5500/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-5500/trainer_state.json b/checkpoint-5500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..14b1ef0421dcf74a09f0cd31dddcd67f0043815f
--- /dev/null
+++ b/checkpoint-5500/trainer_state.json
@@ -0,0 +1,7742 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0007278020378456,
+  "eval_steps": 100,
+  "global_step": 5500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    },
+    {
+      "epoch": 0.16466521106259097,
+      "grad_norm": 0.16048531234264374,
+      "learning_rate": 4.7669949816846173e-05,
+      "loss": 0.18330031633377075,
+      "step": 905
+    },
+    {
+      "epoch": 0.1655749636098981,
+      "grad_norm": 0.1440177708864212,
+      "learning_rate": 4.7638820979495534e-05,
+      "loss": 0.17712442874908446,
+      "step": 910
+    },
+    {
+      "epoch": 0.16648471615720525,
+      "grad_norm": 0.19635969400405884,
+      "learning_rate": 4.760749588548738e-05,
+      "loss": 0.18679027557373046,
+      "step": 915
+    },
+    {
+      "epoch": 0.16739446870451238,
+      "grad_norm": 0.15576541423797607,
+      "learning_rate": 4.757597480637995e-05,
+      "loss": 0.19283764362335204,
+      "step": 920
+    },
+    {
+      "epoch": 0.1683042212518195,
+      "grad_norm": 0.1550331562757492,
+      "learning_rate": 4.7544258015430463e-05,
+      "loss": 0.18269542455673218,
+      "step": 925
+    },
+    {
+      "epoch": 0.16921397379912664,
+      "grad_norm": 0.18369626998901367,
+      "learning_rate": 4.75123457875928e-05,
+      "loss": 0.1697891116142273,
+      "step": 930
+    },
+    {
+      "epoch": 0.17012372634643377,
+      "grad_norm": 0.15266314148902893,
+      "learning_rate": 4.7480238399515074e-05,
+      "loss": 0.18523451089859008,
+      "step": 935
+    },
+    {
+      "epoch": 0.1710334788937409,
+      "grad_norm": 0.16709664463996887,
+      "learning_rate": 4.744793612953724e-05,
+      "loss": 0.1803238034248352,
+      "step": 940
+    },
+    {
+      "epoch": 0.17194323144104803,
+      "grad_norm": 0.14929179847240448,
+      "learning_rate": 4.741543925768872e-05,
+      "loss": 0.1861217737197876,
+      "step": 945
+    },
+    {
+      "epoch": 0.17285298398835516,
+      "grad_norm": 0.1362280696630478,
+      "learning_rate": 4.7382748065685915e-05,
+      "loss": 0.17896100282669067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1737627365356623,
+      "grad_norm": 0.15290239453315735,
+      "learning_rate": 4.734986283692982e-05,
+      "loss": 0.18432788848876952,
+      "step": 955
+    },
+    {
+      "epoch": 0.17467248908296942,
+      "grad_norm": 0.1287035197019577,
+      "learning_rate": 4.73167838565035e-05,
+      "loss": 0.18485682010650634,
+      "step": 960
+    },
+    {
+      "epoch": 0.17558224163027655,
+      "grad_norm": 0.17969627678394318,
+      "learning_rate": 4.728351141116971e-05,
+      "loss": 0.17361557483673096,
+      "step": 965
+    },
+    {
+      "epoch": 0.1764919941775837,
+      "grad_norm": 0.13751201331615448,
+      "learning_rate": 4.7250045789368326e-05,
+      "loss": 0.1731679320335388,
+      "step": 970
+    },
+    {
+      "epoch": 0.17740174672489084,
+      "grad_norm": 0.1603265255689621,
+      "learning_rate": 4.721638728121388e-05,
+      "loss": 0.17308170795440675,
+      "step": 975
+    },
+    {
+      "epoch": 0.17831149927219797,
+      "grad_norm": 0.1592789888381958,
+      "learning_rate": 4.718253617849306e-05,
+      "loss": 0.17534757852554322,
+      "step": 980
+    },
+    {
+      "epoch": 0.1792212518195051,
+      "grad_norm": 0.12727224826812744,
+      "learning_rate": 4.714849277466214e-05,
+      "loss": 0.17817609310150145,
+      "step": 985
+    },
+    {
+      "epoch": 0.18013100436681223,
+      "grad_norm": 0.15401554107666016,
+      "learning_rate": 4.711425736484447e-05,
+      "loss": 0.1733405351638794,
+      "step": 990
+    },
+    {
+      "epoch": 0.18104075691411936,
+      "grad_norm": 0.13253968954086304,
+      "learning_rate": 4.7079830245827906e-05,
+      "loss": 0.17846795320510864,
+      "step": 995
+    },
+    {
+      "epoch": 0.1819505094614265,
+      "grad_norm": 0.21846213936805725,
+      "learning_rate": 4.7045211716062245e-05,
+      "loss": 0.18021599054336548,
+      "step": 1000
+    },
+    {
+      "epoch": 0.18286026200873362,
+      "grad_norm": 0.16867990791797638,
+      "learning_rate": 4.7010402075656595e-05,
+      "loss": 0.18232386112213134,
+      "step": 1005
+    },
+    {
+      "epoch": 0.18377001455604075,
+      "grad_norm": 0.17180582880973816,
+      "learning_rate": 4.697540162637686e-05,
+      "loss": 0.1816317319869995,
+      "step": 1010
+    },
+    {
+      "epoch": 0.18467976710334788,
+      "grad_norm": 0.16480213403701782,
+      "learning_rate": 4.694021067164303e-05,
+      "loss": 0.17718446254730225,
+      "step": 1015
+    },
+    {
+      "epoch": 0.185589519650655,
+      "grad_norm": 0.15015918016433716,
+      "learning_rate": 4.6904829516526605e-05,
+      "loss": 0.17412011623382567,
+      "step": 1020
+    },
+    {
+      "epoch": 0.18649927219796217,
+      "grad_norm": 0.14445139467716217,
+      "learning_rate": 4.686925846774795e-05,
+      "loss": 0.1778018832206726,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1874090247452693,
+      "grad_norm": 0.1701960265636444,
+      "learning_rate": 4.683349783367362e-05,
+      "loss": 0.16901081800460815,
+      "step": 1030
+    },
+    {
+      "epoch": 0.18831877729257643,
+      "grad_norm": 0.15894867479801178,
+      "learning_rate": 4.679754792431368e-05,
+      "loss": 0.17055928707122803,
+      "step": 1035
+    },
+    {
+      "epoch": 0.18922852983988356,
+      "grad_norm": 0.1511942446231842,
+      "learning_rate": 4.676140905131903e-05,
+      "loss": 0.17339680194854737,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1901382823871907,
+      "grad_norm": 0.14735209941864014,
+      "learning_rate": 4.672508152797872e-05,
+      "loss": 0.17802717685699462,
+      "step": 1045
+    },
+    {
+      "epoch": 0.19104803493449782,
+      "grad_norm": 0.17367291450500488,
+      "learning_rate": 4.66885656692172e-05,
+      "loss": 0.1732744097709656,
+      "step": 1050
+    },
+    {
+      "epoch": 0.19195778748180495,
+      "grad_norm": 0.147227481007576,
+      "learning_rate": 4.665186179159159e-05,
+      "loss": 0.17040517330169677,
+      "step": 1055
+    },
+    {
+      "epoch": 0.19286754002911208,
+      "grad_norm": 0.1709655076265335,
+      "learning_rate": 4.6614970213289e-05,
+      "loss": 0.17794088125228882,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1937772925764192,
+      "grad_norm": 0.1588088721036911,
+      "learning_rate": 4.657789125412366e-05,
+      "loss": 0.17180380821228028,
+      "step": 1065
+    },
+    {
+      "epoch": 0.19468704512372634,
+      "grad_norm": 0.14827021956443787,
+      "learning_rate": 4.654062523553428e-05,
+      "loss": 0.182997989654541,
+      "step": 1070
+    },
+    {
+      "epoch": 0.19559679767103347,
+      "grad_norm": 0.16230466961860657,
+      "learning_rate": 4.6503172480581126e-05,
+      "loss": 0.17346880435943604,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1965065502183406,
+      "grad_norm": 0.1637624353170395,
+      "learning_rate": 4.646553331394333e-05,
+      "loss": 0.17263576984405518,
+      "step": 1080
+    },
+    {
+      "epoch": 0.19741630276564776,
+      "grad_norm": 0.15977843105793,
+      "learning_rate": 4.642770806191603e-05,
+      "loss": 0.17284308671951293,
+      "step": 1085
+    },
+    {
+      "epoch": 0.19832605531295489,
+      "grad_norm": 0.15394869446754456,
+      "learning_rate": 4.6389697052407534e-05,
+      "loss": 0.17797101736068727,
+      "step": 1090
+    },
+    {
+      "epoch": 0.19923580786026202,
+      "grad_norm": 0.15995225310325623,
+      "learning_rate": 4.6351500614936485e-05,
+      "loss": 0.18137198686599731,
+      "step": 1095
+    },
+    {
+      "epoch": 0.20014556040756915,
+      "grad_norm": 0.1779479682445526,
+      "learning_rate": 4.6313119080629006e-05,
+      "loss": 0.17998344898223878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.20105531295487628,
+      "grad_norm": 0.14362832903862,
+      "learning_rate": 4.627455278221584e-05,
+      "loss": 0.18196423053741456,
+      "step": 1105
+    },
+    {
+      "epoch": 0.2019650655021834,
+      "grad_norm": 0.15951639413833618,
+      "learning_rate": 4.623580205402947e-05,
+      "loss": 0.17423888444900512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.20287481804949054,
+      "grad_norm": 0.17273563146591187,
+      "learning_rate": 4.619686723200115e-05,
+      "loss": 0.17392473220825194,
+      "step": 1115
+    },
+    {
+      "epoch": 0.20378457059679767,
+      "grad_norm": 0.1655360758304596,
+      "learning_rate": 4.615774865365813e-05,
+      "loss": 0.17528389692306517,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2046943231441048,
+      "grad_norm": 0.15920691192150116,
+      "learning_rate": 4.611844665812058e-05,
+      "loss": 0.1806849241256714,
+      "step": 1125
+    },
+    {
+      "epoch": 0.20560407569141192,
+      "grad_norm": 0.16114577651023865,
+      "learning_rate": 4.607896158609875e-05,
+      "loss": 0.17217352390289306,
+      "step": 1130
+    },
+    {
+      "epoch": 0.20651382823871905,
+      "grad_norm": 0.1499422937631607,
+      "learning_rate": 4.603929377988999e-05,
+      "loss": 0.17806737422943114,
+      "step": 1135
+    },
+    {
+      "epoch": 0.2074235807860262,
+      "grad_norm": 0.17605191469192505,
+      "learning_rate": 4.5999443583375765e-05,
+      "loss": 0.17842113971710205,
+      "step": 1140
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 0.16117210686206818,
+      "learning_rate": 4.595941134201871e-05,
+      "loss": 0.18379683494567872,
+      "step": 1145
+    },
+    {
+      "epoch": 0.20924308588064047,
+      "grad_norm": 0.21199050545692444,
+      "learning_rate": 4.591919740285957e-05,
+      "loss": 0.16286123991012574,
+      "step": 1150
+    },
+    {
+      "epoch": 0.2101528384279476,
+      "grad_norm": 0.15100529789924622,
+      "learning_rate": 4.587880211451427e-05,
+      "loss": 0.17995200157165528,
+      "step": 1155
+    },
+    {
+      "epoch": 0.21106259097525473,
+      "grad_norm": 0.16618172824382782,
+      "learning_rate": 4.583822582717085e-05,
+      "loss": 0.16960303783416747,
+      "step": 1160
+    },
+    {
+      "epoch": 0.21197234352256186,
+      "grad_norm": 0.14743569493293762,
+      "learning_rate": 4.579746889258643e-05,
+      "loss": 0.17762668132781984,
+      "step": 1165
+    },
+    {
+      "epoch": 0.212882096069869,
+      "grad_norm": 0.1697179079055786,
+      "learning_rate": 4.575653166408417e-05,
+      "loss": 0.16656005382537842,
+      "step": 1170
+    },
+    {
+      "epoch": 0.21379184861717612,
+      "grad_norm": 0.14886513352394104,
+      "learning_rate": 4.57154144965502e-05,
+      "loss": 0.17091882228851318,
+      "step": 1175
+    },
+    {
+      "epoch": 0.21470160116448325,
+      "grad_norm": 0.18197473883628845,
+      "learning_rate": 4.5674117746430556e-05,
+      "loss": 0.1770920753479004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.21561135371179038,
+      "grad_norm": 0.17323088645935059,
+      "learning_rate": 4.563264177172807e-05,
+      "loss": 0.1734643578529358,
+      "step": 1185
+    },
+    {
+      "epoch": 0.2165211062590975,
+      "grad_norm": 0.1521984338760376,
+      "learning_rate": 4.559098693199929e-05,
+      "loss": 0.17515116930007935,
+      "step": 1190
+    },
+    {
+      "epoch": 0.21743085880640467,
+      "grad_norm": 0.1842304915189743,
+      "learning_rate": 4.554915358835134e-05,
+      "loss": 0.16798022985458375,
+      "step": 1195
+    },
+    {
+      "epoch": 0.2183406113537118,
+      "grad_norm": 0.14753451943397522,
+      "learning_rate": 4.5507142103438794e-05,
+      "loss": 0.1755476713180542,
+      "step": 1200
+    },
+    {
+      "epoch": 0.21925036390101893,
+      "grad_norm": 0.17096194624900818,
+      "learning_rate": 4.546495284146057e-05,
+      "loss": 0.1792473554611206,
+      "step": 1205
+    },
+    {
+      "epoch": 0.22016011644832606,
+      "grad_norm": 0.1579233556985855,
+      "learning_rate": 4.542258616815669e-05,
+      "loss": 0.17230144739151002,
+      "step": 1210
+    },
+    {
+      "epoch": 0.2210698689956332,
+      "grad_norm": 0.177297905087471,
+      "learning_rate": 4.5380042450805216e-05,
+      "loss": 0.1807127833366394,
+      "step": 1215
+    },
+    {
+      "epoch": 0.22197962154294032,
+      "grad_norm": 0.14331696927547455,
+      "learning_rate": 4.533732205821897e-05,
+      "loss": 0.17201389074325563,
+      "step": 1220
+    },
+    {
+      "epoch": 0.22288937409024745,
+      "grad_norm": 0.14473360776901245,
+      "learning_rate": 4.529442536074239e-05,
+      "loss": 0.17036900520324708,
+      "step": 1225
+    },
+    {
+      "epoch": 0.22379912663755458,
+      "grad_norm": 0.1820901483297348,
+      "learning_rate": 4.5251352730248314e-05,
+      "loss": 0.17704882621765136,
+      "step": 1230
+    },
+    {
+      "epoch": 0.2247088791848617,
+      "grad_norm": 0.1948976367712021,
+      "learning_rate": 4.5208104540134746e-05,
+      "loss": 0.1706973433494568,
+      "step": 1235
+    },
+    {
+      "epoch": 0.22561863173216884,
+      "grad_norm": 0.16660070419311523,
+      "learning_rate": 4.51646811653216e-05,
+      "loss": 0.17636821269989014,
+      "step": 1240
+    },
+    {
+      "epoch": 0.22652838427947597,
+      "grad_norm": 0.1699984073638916,
+      "learning_rate": 4.512108298224751e-05,
+      "loss": 0.16986632347106934,
+      "step": 1245
+    },
+    {
+      "epoch": 0.22743813682678313,
+      "grad_norm": 0.17601042985916138,
+      "learning_rate": 4.50773103688665e-05,
+      "loss": 0.17507898807525635,
+      "step": 1250
+    },
+    {
+      "epoch": 0.22834788937409026,
+      "grad_norm": 0.17557238042354584,
+      "learning_rate": 4.503336370464476e-05,
+      "loss": 0.17702863216400147,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2292576419213974,
+      "grad_norm": 0.1800651252269745,
+      "learning_rate": 4.498924337055729e-05,
+      "loss": 0.16419180631637573,
+      "step": 1260
+    },
+    {
+      "epoch": 0.23016739446870452,
+      "grad_norm": 0.2022479772567749,
+      "learning_rate": 4.494494974908468e-05,
+      "loss": 0.17482060194015503,
+      "step": 1265
+    },
+    {
+      "epoch": 0.23107714701601165,
+      "grad_norm": 0.14180205762386322,
+      "learning_rate": 4.490048322420973e-05,
+      "loss": 0.1723136067390442,
+      "step": 1270
+    },
+    {
+      "epoch": 0.23198689956331878,
+      "grad_norm": 0.18607310950756073,
+      "learning_rate": 4.485584418141419e-05,
+      "loss": 0.17096419334411622,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2328966521106259,
+      "grad_norm": 0.15958310663700104,
+      "learning_rate": 4.481103300767529e-05,
+      "loss": 0.1656244158744812,
+      "step": 1280
+    },
+    {
+      "epoch": 0.23380640465793304,
+      "grad_norm": 0.17552383244037628,
+      "learning_rate": 4.476605009146255e-05,
+      "loss": 0.17677626609802247,
+      "step": 1285
+    },
+    {
+      "epoch": 0.23471615720524017,
+      "grad_norm": 0.15299823880195618,
+      "learning_rate": 4.472089582273429e-05,
+      "loss": 0.1778991103172302,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2356259097525473,
+      "grad_norm": 0.14613987505435944,
+      "learning_rate": 4.46755705929343e-05,
+      "loss": 0.17071452140808105,
+      "step": 1295
+    },
+    {
+      "epoch": 0.23653566229985443,
+      "grad_norm": 0.17781122028827667,
+      "learning_rate": 4.463007479498843e-05,
+      "loss": 0.16955430507659913,
+      "step": 1300
+    },
+    {
+      "epoch": 0.23744541484716158,
+      "grad_norm": 0.16326487064361572,
+      "learning_rate": 4.458440882330119e-05,
+      "loss": 0.1777693510055542,
+      "step": 1305
+    },
+    {
+      "epoch": 0.23835516739446871,
+      "grad_norm": 0.17701926827430725,
+      "learning_rate": 4.4538573073752365e-05,
+      "loss": 0.16323351860046387,
+      "step": 1310
+    },
+    {
+      "epoch": 0.23926491994177584,
+      "grad_norm": 0.13104717433452606,
+      "learning_rate": 4.449256794369349e-05,
+      "loss": 0.17653456926345826,
+      "step": 1315
+    },
+    {
+      "epoch": 0.24017467248908297,
+      "grad_norm": 0.1796836256980896,
+      "learning_rate": 4.444639383194452e-05,
+      "loss": 0.17189600467681884,
+      "step": 1320
+    },
+    {
+      "epoch": 0.2410844250363901,
+      "grad_norm": 0.14919696748256683,
+      "learning_rate": 4.440005113879029e-05,
+      "loss": 0.17003334760665895,
+      "step": 1325
+    },
+    {
+      "epoch": 0.24199417758369723,
+      "grad_norm": 0.1728784441947937,
+      "learning_rate": 4.4353540265977064e-05,
+      "loss": 0.17397408485412597,
+      "step": 1330
+    },
+    {
+      "epoch": 0.24290393013100436,
+      "grad_norm": 0.14591015875339508,
+      "learning_rate": 4.43068616167091e-05,
+      "loss": 0.16498478651046752,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2438136826783115,
+      "grad_norm": 0.18417201936244965,
+      "learning_rate": 4.4260015595645055e-05,
+      "loss": 0.16841750144958495,
+      "step": 1340
+    },
+    {
+      "epoch": 0.24472343522561862,
+      "grad_norm": 0.16264279186725616,
+      "learning_rate": 4.4213002608894605e-05,
+      "loss": 0.16907373666763306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.24563318777292575,
+      "grad_norm": 0.15248481929302216,
+      "learning_rate": 4.416582306401481e-05,
+      "loss": 0.15931472778320313,
+      "step": 1350
+    },
+    {
+      "epoch": 0.24654294032023288,
+      "grad_norm": 0.1488373875617981,
+      "learning_rate": 4.4118477370006636e-05,
+      "loss": 0.1701716423034668,
+      "step": 1355
+    },
+    {
+      "epoch": 0.24745269286754004,
+      "grad_norm": 0.14679782092571259,
+      "learning_rate": 4.407096593731142e-05,
+      "loss": 0.157412326335907,
+      "step": 1360
+    },
+    {
+      "epoch": 0.24836244541484717,
+      "grad_norm": 0.17139530181884766,
+      "learning_rate": 4.402328917780728e-05,
+      "loss": 0.17303754091262818,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2492721979621543,
+      "grad_norm": 0.1534871757030487,
+      "learning_rate": 4.397544750480554e-05,
+      "loss": 0.1786255121231079,
+      "step": 1370
+    },
+    {
+      "epoch": 0.2501819505094614,
+      "grad_norm": 0.1876252293586731,
+      "learning_rate": 4.39274413330472e-05,
+      "loss": 0.16442898511886597,
+      "step": 1375
+    },
+    {
+      "epoch": 0.25109170305676853,
+      "grad_norm": 0.16165752708911896,
+      "learning_rate": 4.387927107869928e-05,
+      "loss": 0.1780426025390625,
+      "step": 1380
+    },
+    {
+      "epoch": 0.25200145560407566,
+      "grad_norm": 0.17242255806922913,
+      "learning_rate": 4.383093715935124e-05,
+      "loss": 0.15959256887435913,
+      "step": 1385
+    },
+    {
+      "epoch": 0.25291120815138285,
+      "grad_norm": 0.1627114862203598,
+      "learning_rate": 4.378243999401137e-05,
+      "loss": 0.17606115341186523,
+      "step": 1390
+    },
+    {
+      "epoch": 0.25382096069869,
+      "grad_norm": 0.15911224484443665,
+      "learning_rate": 4.373378000310312e-05,
+      "loss": 0.16798585653305054,
+      "step": 1395
+    },
+    {
+      "epoch": 0.2547307132459971,
+      "grad_norm": 0.15542249381542206,
+      "learning_rate": 4.3684957608461505e-05,
+      "loss": 0.1695417881011963,
+      "step": 1400
+    },
+    {
+      "epoch": 0.25564046579330424,
+      "grad_norm": 0.1475304812192917,
+      "learning_rate": 4.363597323332941e-05,
+      "loss": 0.16340878009796142,
+      "step": 1405
+    },
+    {
+      "epoch": 0.25655021834061137,
+      "grad_norm": 0.16943927109241486,
+      "learning_rate": 4.358682730235395e-05,
+      "loss": 0.17240238189697266,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2574599708879185,
+      "grad_norm": 0.1816391944885254,
+      "learning_rate": 4.3537520241582744e-05,
+      "loss": 0.16558437347412108,
+      "step": 1415
+    },
+    {
+      "epoch": 0.25836972343522563,
+      "grad_norm": 0.23851341009140015,
+      "learning_rate": 4.348805247846027e-05,
+      "loss": 0.16796000003814698,
+      "step": 1420
+    },
+    {
+      "epoch": 0.25927947598253276,
+      "grad_norm": 0.15415243804454803,
+      "learning_rate": 4.343842444182414e-05,
+      "loss": 0.1746017098426819,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2601892285298399,
+      "grad_norm": 0.15651032328605652,
+      "learning_rate": 4.338863656190139e-05,
+      "loss": 0.1649057984352112,
+      "step": 1430
+    },
+    {
+      "epoch": 0.261098981077147,
+      "grad_norm": 0.16601966321468353,
+      "learning_rate": 4.333868927030471e-05,
+      "loss": 0.15888988971710205,
+      "step": 1435
+    },
+    {
+      "epoch": 0.26200873362445415,
+      "grad_norm": 0.1549467295408249,
+      "learning_rate": 4.328858300002876e-05,
+      "loss": 0.16357985734939576,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2629184861717613,
+      "grad_norm": 0.16332370042800903,
+      "learning_rate": 4.32383181854464e-05,
+      "loss": 0.16749982833862304,
+      "step": 1445
+    },
+    {
+      "epoch": 0.2638282387190684,
+      "grad_norm": 0.14827077090740204,
+      "learning_rate": 4.3187895262304894e-05,
+      "loss": 0.16886214017868043,
+      "step": 1450
+    },
+    {
+      "epoch": 0.26473799126637554,
+      "grad_norm": 0.1557198166847229,
+      "learning_rate": 4.313731466772216e-05,
+      "loss": 0.17512214183807373,
+      "step": 1455
+    },
+    {
+      "epoch": 0.26564774381368267,
+      "grad_norm": 0.17263570427894592,
+      "learning_rate": 4.308657684018299e-05,
+      "loss": 0.16248074769973755,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2665574963609898,
+      "grad_norm": 0.17135761678218842,
+      "learning_rate": 4.303568221953521e-05,
+      "loss": 0.16605921983718872,
+      "step": 1465
+    },
+    {
+      "epoch": 0.26746724890829693,
+      "grad_norm": 0.14322632551193237,
+      "learning_rate": 4.2984631246985897e-05,
+      "loss": 0.1610772728919983,
+      "step": 1470
+    },
+    {
+      "epoch": 0.26837700145560406,
+      "grad_norm": 0.18852312862873077,
+      "learning_rate": 4.2933424365097564e-05,
+      "loss": 0.1686462163925171,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2692867540029112,
+      "grad_norm": 0.1780245155096054,
+      "learning_rate": 4.2882062017784294e-05,
+      "loss": 0.16953932046890258,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2701965065502183,
+      "grad_norm": 0.180568665266037,
+      "learning_rate": 4.2830544650307895e-05,
+      "loss": 0.16442664861679077,
+      "step": 1485
+    },
+    {
+      "epoch": 0.27110625909752545,
+      "grad_norm": 0.16876435279846191,
+      "learning_rate": 4.277887270927407e-05,
+      "loss": 0.17128173112869263,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2720160116448326,
+      "grad_norm": 0.164053276181221,
+      "learning_rate": 4.2727046642628513e-05,
+      "loss": 0.16331382989883422,
+      "step": 1495
+    },
+    {
+      "epoch": 0.27292576419213976,
+      "grad_norm": 0.14577528834342957,
+      "learning_rate": 4.267506689965305e-05,
+      "loss": 0.1638316035270691,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2738355167394469,
+      "grad_norm": 0.1648740917444229,
+      "learning_rate": 4.262293393096171e-05,
+      "loss": 0.15332664251327516,
+      "step": 1505
+    },
+    {
+      "epoch": 0.274745269286754,
+      "grad_norm": 0.16445094347000122,
+      "learning_rate": 4.257064818849685e-05,
+      "loss": 0.1706634521484375,
+      "step": 1510
+    },
+    {
+      "epoch": 0.27565502183406115,
+      "grad_norm": 0.1584935486316681,
+      "learning_rate": 4.251821012552524e-05,
+      "loss": 0.1684114694595337,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2765647743813683,
+      "grad_norm": 0.17215611040592194,
+      "learning_rate": 4.24656201966341e-05,
+      "loss": 0.15594131946563722,
+      "step": 1520
+    },
+    {
+      "epoch": 0.2774745269286754,
+      "grad_norm": 0.15945589542388916,
+      "learning_rate": 4.2412878857727214e-05,
+      "loss": 0.1686659574508667,
+      "step": 1525
+    },
+    {
+      "epoch": 0.27838427947598254,
+      "grad_norm": 0.16103951632976532,
+      "learning_rate": 4.2359986566020906e-05,
+      "loss": 0.17779340744018554,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2792940320232897,
+      "grad_norm": 0.1770307570695877,
+      "learning_rate": 4.230694378004014e-05,
+      "loss": 0.16786882877349854,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2802037845705968,
+      "grad_norm": 0.16225053369998932,
+      "learning_rate": 4.2253750959614504e-05,
+      "loss": 0.16558897495269775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.28111353711790393,
+      "grad_norm": 0.27213969826698303,
+      "learning_rate": 4.220040856587425e-05,
+      "loss": 0.1641119599342346,
+      "step": 1545
+    },
+    {
+      "epoch": 0.28202328966521106,
+      "grad_norm": 0.1773071587085724,
+      "learning_rate": 4.2146917061246284e-05,
+      "loss": 0.16919140815734862,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2829330422125182,
+      "grad_norm": 0.15519705414772034,
+      "learning_rate": 4.209327690945014e-05,
+      "loss": 0.15501506328582765,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2838427947598253,
+      "grad_norm": 0.19921597838401794,
+      "learning_rate": 4.203948857549402e-05,
+      "loss": 0.1690821886062622,
+      "step": 1560
+    },
+    {
+      "epoch": 0.28475254730713245,
+      "grad_norm": 0.15417630970478058,
+      "learning_rate": 4.1985552525670696e-05,
+      "loss": 0.1675640344619751,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2856622998544396,
+      "grad_norm": 0.1739572137594223,
+      "learning_rate": 4.193146922755348e-05,
+      "loss": 0.16738017797470092,
+      "step": 1570
+    },
+    {
+      "epoch": 0.2865720524017467,
+      "grad_norm": 0.1384361982345581,
+      "learning_rate": 4.187723914999221e-05,
+      "loss": 0.16802358627319336,
+      "step": 1575
+    },
+    {
+      "epoch": 0.28748180494905384,
+      "grad_norm": 0.1491454839706421,
+      "learning_rate": 4.182286276310915e-05,
+      "loss": 0.1619583249092102,
+      "step": 1580
+    },
+    {
+      "epoch": 0.288391557496361,
+      "grad_norm": 0.15831919014453888,
+      "learning_rate": 4.176834053829492e-05,
+      "loss": 0.1625199794769287,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2893013100436681,
+      "grad_norm": 0.16265396773815155,
+      "learning_rate": 4.1713672948204416e-05,
+      "loss": 0.16718552112579346,
+      "step": 1590
+    },
+    {
+      "epoch": 0.29021106259097523,
+      "grad_norm": 0.15153461694717407,
+      "learning_rate": 4.1658860466752714e-05,
+      "loss": 0.15979087352752686,
+      "step": 1595
+    },
+    {
+      "epoch": 0.29112081513828236,
+      "grad_norm": 0.1620412915945053,
+      "learning_rate": 4.160390356911096e-05,
+      "loss": 0.16103557348251343,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2920305676855895,
+      "grad_norm": 0.16673807799816132,
+      "learning_rate": 4.154880273170223e-05,
+      "loss": 0.16394708156585694,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2929403202328967,
+      "grad_norm": 0.14834867417812347,
+      "learning_rate": 4.149355843219744e-05,
+      "loss": 0.15916435718536376,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2938500727802038,
+      "grad_norm": 0.16977964341640472,
+      "learning_rate": 4.143817114951119e-05,
+      "loss": 0.16538127660751342,
+      "step": 1615
+    },
+    {
+      "epoch": 0.29475982532751094,
+      "grad_norm": 0.17986875772476196,
+      "learning_rate": 4.138264136379756e-05,
+      "loss": 0.15514618158340454,
+      "step": 1620
+    },
+    {
+      "epoch": 0.29566957787481807,
+      "grad_norm": 0.15794920921325684,
+      "learning_rate": 4.132696955644605e-05,
+      "loss": 0.15992183685302735,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2965793304221252,
+      "grad_norm": 0.19955399632453918,
+      "learning_rate": 4.127115621007731e-05,
+      "loss": 0.16362056732177735,
+      "step": 1630
+    },
+    {
+      "epoch": 0.29748908296943233,
+      "grad_norm": 0.1352023035287857,
+      "learning_rate": 4.121520180853903e-05,
+      "loss": 0.15631601810455323,
+      "step": 1635
+    },
+    {
+      "epoch": 0.29839883551673946,
+      "grad_norm": 0.15340781211853027,
+      "learning_rate": 4.1159106836901674e-05,
+      "loss": 0.1571858048439026,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2993085880640466,
+      "grad_norm": 0.15311770141124725,
+      "learning_rate": 4.110287178145433e-05,
+      "loss": 0.16082344055175782,
+      "step": 1645
+    },
+    {
+      "epoch": 0.3002183406113537,
+      "grad_norm": 0.17811627686023712,
+      "learning_rate": 4.10464971297005e-05,
+      "loss": 0.16117215156555176,
+      "step": 1650
+    },
+    {
+      "epoch": 0.30112809315866085,
+      "grad_norm": 0.21060039103031158,
+      "learning_rate": 4.0989983370353805e-05,
+      "loss": 0.15838587284088135,
+      "step": 1655
+    },
+    {
+      "epoch": 0.302037845705968,
+      "grad_norm": 0.155836820602417,
+      "learning_rate": 4.093333099333383e-05,
+      "loss": 0.16648870706558228,
+      "step": 1660
+    },
+    {
+      "epoch": 0.3029475982532751,
+      "grad_norm": 0.13711698353290558,
+      "learning_rate": 4.0876540489761826e-05,
+      "loss": 0.16899349689483642,
+      "step": 1665
+    },
+    {
+      "epoch": 0.30385735080058224,
+      "grad_norm": 0.15162716805934906,
+      "learning_rate": 4.0819612351956485e-05,
+      "loss": 0.16574090719223022,
+      "step": 1670
+    },
+    {
+      "epoch": 0.30476710334788937,
+      "grad_norm": 0.15016348659992218,
+      "learning_rate": 4.0762547073429615e-05,
+      "loss": 0.1689780354499817,
+      "step": 1675
+    },
+    {
+      "epoch": 0.3056768558951965,
+      "grad_norm": 0.15182986855506897,
+      "learning_rate": 4.070534514888194e-05,
+      "loss": 0.1593686819076538,
+      "step": 1680
+    },
+    {
+      "epoch": 0.3065866084425036,
+      "grad_norm": 0.15648750960826874,
+      "learning_rate": 4.0648007074198765e-05,
+      "loss": 0.16436235904693602,
+      "step": 1685
+    },
+    {
+      "epoch": 0.30749636098981076,
+      "grad_norm": 0.18339484930038452,
+      "learning_rate": 4.0590533346445665e-05,
+      "loss": 0.1678077220916748,
+      "step": 1690
+    },
+    {
+      "epoch": 0.3084061135371179,
+      "grad_norm": 0.16426527500152588,
+      "learning_rate": 4.053292446386422e-05,
+      "loss": 0.1689227342605591,
+      "step": 1695
+    },
+    {
+      "epoch": 0.309315866084425,
+      "grad_norm": 0.16129335761070251,
+      "learning_rate": 4.047518092586766e-05,
+      "loss": 0.16592445373535156,
+      "step": 1700
+    },
+    {
+      "epoch": 0.31022561863173215,
+      "grad_norm": 0.15512363612651825,
+      "learning_rate": 4.041730323303654e-05,
+      "loss": 0.16142364740371704,
+      "step": 1705
+    },
+    {
+      "epoch": 0.3111353711790393,
+      "grad_norm": 0.159842386841774,
+      "learning_rate": 4.0359291887114425e-05,
+      "loss": 0.1702875852584839,
+      "step": 1710
+    },
+    {
+      "epoch": 0.3120451237263464,
+      "grad_norm": 0.19558854401111603,
+      "learning_rate": 4.030114739100352e-05,
+      "loss": 0.15966148376464845,
+      "step": 1715
+    },
+    {
+      "epoch": 0.3129548762736536,
+      "grad_norm": 0.1577496975660324,
+      "learning_rate": 4.024287024876029e-05,
+      "loss": 0.1620358943939209,
+      "step": 1720
+    },
+    {
+      "epoch": 0.3138646288209607,
+      "grad_norm": 0.1629355251789093,
+      "learning_rate": 4.0184460965591144e-05,
+      "loss": 0.16511552333831786,
+      "step": 1725
+    },
+    {
+      "epoch": 0.31477438136826785,
+      "grad_norm": 0.17060767114162445,
+      "learning_rate": 4.0125920047848e-05,
+      "loss": 0.15672838687896729,
+      "step": 1730
+    },
+    {
+      "epoch": 0.315684133915575,
+      "grad_norm": 0.22447620332241058,
+      "learning_rate": 4.006724800302394e-05,
+      "loss": 0.15339784622192382,
+      "step": 1735
+    },
+    {
+      "epoch": 0.3165938864628821,
+      "grad_norm": 0.14572037756443024,
+      "learning_rate": 4.000844533974878e-05,
+      "loss": 0.16566959619522095,
+      "step": 1740
+    },
+    {
+      "epoch": 0.31750363901018924,
+      "grad_norm": 0.15915483236312866,
+      "learning_rate": 3.9949512567784684e-05,
+      "loss": 0.16153957843780517,
+      "step": 1745
+    },
+    {
+      "epoch": 0.3184133915574964,
+      "grad_norm": 0.1668540984392166,
+      "learning_rate": 3.9890450198021704e-05,
+      "loss": 0.1659809947013855,
+      "step": 1750
+    },
+    {
+      "epoch": 0.3193231441048035,
+      "grad_norm": 0.16612035036087036,
+      "learning_rate": 3.983125874247341e-05,
+      "loss": 0.16941241025924683,
+      "step": 1755
+    },
+    {
+      "epoch": 0.32023289665211063,
+      "grad_norm": 0.15163679420948029,
+      "learning_rate": 3.9771938714272407e-05,
+      "loss": 0.16053590774536133,
+      "step": 1760
+    },
+    {
+      "epoch": 0.32114264919941776,
+      "grad_norm": 0.1797824203968048,
+      "learning_rate": 3.97124906276659e-05,
+      "loss": 0.1667110800743103,
+      "step": 1765
+    },
+    {
+      "epoch": 0.3220524017467249,
+      "grad_norm": 0.15076608955860138,
+      "learning_rate": 3.9652914998011237e-05,
+      "loss": 0.1607860803604126,
+      "step": 1770
+    },
+    {
+      "epoch": 0.322962154294032,
+      "grad_norm": 0.16523587703704834,
+      "learning_rate": 3.959321234177144e-05,
+      "loss": 0.16515827178955078,
+      "step": 1775
+    },
+    {
+      "epoch": 0.32387190684133915,
+      "grad_norm": 0.22065149247646332,
+      "learning_rate": 3.9533383176510746e-05,
+      "loss": 0.1618957757949829,
+      "step": 1780
+    },
+    {
+      "epoch": 0.3247816593886463,
+      "grad_norm": 0.16426463425159454,
+      "learning_rate": 3.9473428020890066e-05,
+      "loss": 0.15763382911682128,
+      "step": 1785
+    },
+    {
+      "epoch": 0.3256914119359534,
+      "grad_norm": 0.16474904119968414,
+      "learning_rate": 3.941334739466257e-05,
+      "loss": 0.15135571956634522,
+      "step": 1790
+    },
+    {
+      "epoch": 0.32660116448326054,
+      "grad_norm": 0.16746412217617035,
+      "learning_rate": 3.935314181866909e-05,
+      "loss": 0.15925389528274536,
+      "step": 1795
+    },
+    {
+      "epoch": 0.32751091703056767,
+      "grad_norm": 0.17819371819496155,
+      "learning_rate": 3.929281181483369e-05,
+      "loss": 0.1598669171333313,
+      "step": 1800
+    },
+    {
+      "epoch": 0.3284206695778748,
+      "grad_norm": 0.1816040277481079,
+      "learning_rate": 3.923235790615907e-05,
+      "loss": 0.1652522087097168,
+      "step": 1805
+    },
+    {
+      "epoch": 0.32933042212518193,
+      "grad_norm": 0.14846695959568024,
+      "learning_rate": 3.917178061672211e-05,
+      "loss": 0.16665585041046144,
+      "step": 1810
+    },
+    {
+      "epoch": 0.33024017467248906,
+      "grad_norm": 0.1734926551580429,
+      "learning_rate": 3.911108047166924e-05,
+      "loss": 0.16069791316986085,
+      "step": 1815
+    },
+    {
+      "epoch": 0.3311499272197962,
+      "grad_norm": 0.16154922544956207,
+      "learning_rate": 3.905025799721194e-05,
+      "loss": 0.16114097833633423,
+      "step": 1820
+    },
+    {
+      "epoch": 0.3320596797671033,
+      "grad_norm": 0.1538771390914917,
+      "learning_rate": 3.898931372062217e-05,
+      "loss": 0.1602831244468689,
+      "step": 1825
+    },
+    {
+      "epoch": 0.3329694323144105,
+      "grad_norm": 0.14036566019058228,
+      "learning_rate": 3.892824817022781e-05,
+      "loss": 0.1502395749092102,
+      "step": 1830
+    },
+    {
+      "epoch": 0.33387918486171764,
+      "grad_norm": 0.19212059676647186,
+      "learning_rate": 3.886706187540804e-05,
+      "loss": 0.16265250444412233,
+      "step": 1835
+    },
+    {
+      "epoch": 0.33478893740902477,
+      "grad_norm": 0.17410333454608917,
+      "learning_rate": 3.880575536658881e-05,
+      "loss": 0.15689224004745483,
+      "step": 1840
+    },
+    {
+      "epoch": 0.3356986899563319,
+      "grad_norm": 0.15165294706821442,
+      "learning_rate": 3.874432917523817e-05,
+      "loss": 0.15033140182495117,
+      "step": 1845
+    },
+    {
+      "epoch": 0.336608442503639,
+      "grad_norm": 0.16166730225086212,
+      "learning_rate": 3.8682783833861736e-05,
+      "loss": 0.16896235942840576,
+      "step": 1850
+    },
+    {
+      "epoch": 0.33751819505094616,
+      "grad_norm": 0.16497021913528442,
+      "learning_rate": 3.8621119875998026e-05,
+      "loss": 0.1600774645805359,
+      "step": 1855
+    },
+    {
+      "epoch": 0.3384279475982533,
+      "grad_norm": 0.17264948785305023,
+      "learning_rate": 3.855933783621384e-05,
+      "loss": 0.16947593688964843,
+      "step": 1860
+    },
+    {
+      "epoch": 0.3393377001455604,
+      "grad_norm": 0.16870704293251038,
+      "learning_rate": 3.8497438250099636e-05,
+      "loss": 0.16062095165252685,
+      "step": 1865
+    },
+    {
+      "epoch": 0.34024745269286755,
+      "grad_norm": 0.16644036769866943,
+      "learning_rate": 3.843542165426492e-05,
+      "loss": 0.16015599966049193,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3411572052401747,
+      "grad_norm": 0.1626352220773697,
+      "learning_rate": 3.837328858633349e-05,
+      "loss": 0.17444703578948975,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3420669577874818,
+      "grad_norm": 0.1427375227212906,
+      "learning_rate": 3.83110395849389e-05,
+      "loss": 0.1589805006980896,
+      "step": 1880
+    },
+    {
+      "epoch": 0.34297671033478894,
+      "grad_norm": 0.17840255796909332,
+      "learning_rate": 3.824867518971973e-05,
+      "loss": 0.15953952074050903,
+      "step": 1885
+    },
+    {
+      "epoch": 0.34388646288209607,
+      "grad_norm": 0.16998249292373657,
+      "learning_rate": 3.818619594131489e-05,
+      "loss": 0.16027032136917113,
+      "step": 1890
+    },
+    {
+      "epoch": 0.3447962154294032,
+      "grad_norm": 0.14950257539749146,
+      "learning_rate": 3.812360238135897e-05,
+      "loss": 0.15335670709609986,
+      "step": 1895
+    },
+    {
+      "epoch": 0.3457059679767103,
+      "grad_norm": 0.1678011417388916,
+      "learning_rate": 3.806089505247752e-05,
+      "loss": 0.1560648798942566,
+      "step": 1900
+    },
+    {
+      "epoch": 0.34661572052401746,
+      "grad_norm": 0.17944541573524475,
+      "learning_rate": 3.799807449828238e-05,
+      "loss": 0.16072254180908202,
+      "step": 1905
+    },
+    {
+      "epoch": 0.3475254730713246,
+      "grad_norm": 0.166817307472229,
+      "learning_rate": 3.793514126336691e-05,
+      "loss": 0.1542820692062378,
+      "step": 1910
+    },
+    {
+      "epoch": 0.3484352256186317,
+      "grad_norm": 0.16047626733779907,
+      "learning_rate": 3.787209589330134e-05,
+      "loss": 0.16092092990875245,
+      "step": 1915
+    },
+    {
+      "epoch": 0.34934497816593885,
+      "grad_norm": 0.16478900611400604,
+      "learning_rate": 3.7808938934627965e-05,
+      "loss": 0.16765867471694945,
+      "step": 1920
+    },
+    {
+      "epoch": 0.350254730713246,
+      "grad_norm": 0.15349514782428741,
+      "learning_rate": 3.774567093485648e-05,
+      "loss": 0.15890377759933472,
+      "step": 1925
+    },
+    {
+      "epoch": 0.3511644832605531,
+      "grad_norm": 0.1515921950340271,
+      "learning_rate": 3.768229244245917e-05,
+      "loss": 0.16668319702148438,
+      "step": 1930
+    },
+    {
+      "epoch": 0.35207423580786024,
+      "grad_norm": 0.16310466825962067,
+      "learning_rate": 3.7618804006866195e-05,
+      "loss": 0.15182652473449706,
+      "step": 1935
+    },
+    {
+      "epoch": 0.3529839883551674,
+      "grad_norm": 0.17294517159461975,
+      "learning_rate": 3.755520617846084e-05,
+      "loss": 0.16287628412246705,
+      "step": 1940
+    },
+    {
+      "epoch": 0.35389374090247455,
+      "grad_norm": 0.1482895463705063,
+      "learning_rate": 3.749149950857467e-05,
+      "loss": 0.15321952104568481,
+      "step": 1945
+    },
+    {
+      "epoch": 0.3548034934497817,
+      "grad_norm": 0.2236029952764511,
+      "learning_rate": 3.7427684549482847e-05,
+      "loss": 0.15403482913970948,
+      "step": 1950
+    },
+    {
+      "epoch": 0.3557132459970888,
+      "grad_norm": 0.20185327529907227,
+      "learning_rate": 3.736376185439927e-05,
+      "loss": 0.1633884072303772,
+      "step": 1955
+    },
+    {
+      "epoch": 0.35662299854439594,
+      "grad_norm": 0.13906247913837433,
+      "learning_rate": 3.7299731977471816e-05,
+      "loss": 0.15925350189208984,
+      "step": 1960
+    },
+    {
+      "epoch": 0.35753275109170307,
+      "grad_norm": 0.18665002286434174,
+      "learning_rate": 3.723559547377751e-05,
+      "loss": 0.1612026572227478,
+      "step": 1965
+    },
+    {
+      "epoch": 0.3584425036390102,
+      "grad_norm": 0.16913433372974396,
+      "learning_rate": 3.717135289931774e-05,
+      "loss": 0.15479494333267213,
+      "step": 1970
+    },
+    {
+      "epoch": 0.35935225618631733,
+      "grad_norm": 0.1620066910982132,
+      "learning_rate": 3.7107004811013434e-05,
+      "loss": 0.1604058027267456,
+      "step": 1975
+    },
+    {
+      "epoch": 0.36026200873362446,
+      "grad_norm": 0.16838301718235016,
+      "learning_rate": 3.704255176670021e-05,
+      "loss": 0.15335073471069335,
+      "step": 1980
+    },
+    {
+      "epoch": 0.3611717612809316,
+      "grad_norm": 0.3054695427417755,
+      "learning_rate": 3.6977994325123535e-05,
+      "loss": 0.16558053493499755,
+      "step": 1985
+    },
+    {
+      "epoch": 0.3620815138282387,
+      "grad_norm": 0.1526716649532318,
+      "learning_rate": 3.6913333045933934e-05,
+      "loss": 0.16148923635482787,
+      "step": 1990
+    },
+    {
+      "epoch": 0.36299126637554585,
+      "grad_norm": 0.15328513085842133,
+      "learning_rate": 3.684856848968209e-05,
+      "loss": 0.1553613781929016,
+      "step": 1995
+    },
+    {
+      "epoch": 0.363901018922853,
+      "grad_norm": 0.16129714250564575,
+      "learning_rate": 3.6783701217813995e-05,
+      "loss": 0.16724612712860107,
+      "step": 2000
+    },
+    {
+      "epoch": 0.3648107714701601,
+      "grad_norm": 0.15715539455413818,
+      "learning_rate": 3.6718731792666086e-05,
+      "loss": 0.15867922306060792,
+      "step": 2005
+    },
+    {
+      "epoch": 0.36572052401746724,
+      "grad_norm": 0.15569166839122772,
+      "learning_rate": 3.6653660777460366e-05,
+      "loss": 0.1552058696746826,
+      "step": 2010
+    },
+    {
+      "epoch": 0.36663027656477437,
+      "grad_norm": 0.16223010420799255,
+      "learning_rate": 3.6588488736299535e-05,
+      "loss": 0.1583200454711914,
+      "step": 2015
+    },
+    {
+      "epoch": 0.3675400291120815,
+      "grad_norm": 0.18441995978355408,
+      "learning_rate": 3.652321623416209e-05,
+      "loss": 0.15050662755966188,
+      "step": 2020
+    },
+    {
+      "epoch": 0.36844978165938863,
+      "grad_norm": 0.13792674243450165,
+      "learning_rate": 3.645784383689742e-05,
+      "loss": 0.15458759069442748,
+      "step": 2025
+    },
+    {
+      "epoch": 0.36935953420669576,
+      "grad_norm": 0.14993111789226532,
+      "learning_rate": 3.639237211122091e-05,
+      "loss": 0.15926222801208495,
+      "step": 2030
+    },
+    {
+      "epoch": 0.3702692867540029,
+      "grad_norm": 0.16815930604934692,
+      "learning_rate": 3.632680162470904e-05,
+      "loss": 0.15524441003799438,
+      "step": 2035
+    },
+    {
+      "epoch": 0.37117903930131,
+      "grad_norm": 0.13312821090221405,
+      "learning_rate": 3.626113294579441e-05,
+      "loss": 0.15883516073226928,
+      "step": 2040
+    },
+    {
+      "epoch": 0.37208879184861715,
+      "grad_norm": 0.16838273406028748,
+      "learning_rate": 3.619536664376091e-05,
+      "loss": 0.15829603672027587,
+      "step": 2045
+    },
+    {
+      "epoch": 0.37299854439592434,
+      "grad_norm": 0.14706873893737793,
+      "learning_rate": 3.612950328873869e-05,
+      "loss": 0.15644397735595703,
+      "step": 2050
+    },
+    {
+      "epoch": 0.37390829694323147,
+      "grad_norm": 0.1644199639558792,
+      "learning_rate": 3.606354345169926e-05,
+      "loss": 0.15858219861984252,
+      "step": 2055
+    },
+    {
+      "epoch": 0.3748180494905386,
+      "grad_norm": 0.18077051639556885,
+      "learning_rate": 3.599748770445055e-05,
+      "loss": 0.1641286849975586,
+      "step": 2060
+    },
+    {
+      "epoch": 0.3757278020378457,
+      "grad_norm": 0.16329127550125122,
+      "learning_rate": 3.5931336619631914e-05,
+      "loss": 0.15027186870574952,
+      "step": 2065
+    },
+    {
+      "epoch": 0.37663755458515286,
+      "grad_norm": 0.16346783936023712,
+      "learning_rate": 3.586509077070922e-05,
+      "loss": 0.1558641314506531,
+      "step": 2070
+    },
+    {
+      "epoch": 0.37754730713246,
+      "grad_norm": 0.1727602630853653,
+      "learning_rate": 3.5798750731969834e-05,
+      "loss": 0.15390506982803345,
+      "step": 2075
+    },
+    {
+      "epoch": 0.3784570596797671,
+      "grad_norm": 0.7598192691802979,
+      "learning_rate": 3.5732317078517654e-05,
+      "loss": 0.1533232808113098,
+      "step": 2080
+    },
+    {
+      "epoch": 0.37936681222707425,
+      "grad_norm": 0.1433355212211609,
+      "learning_rate": 3.5665790386268124e-05,
+      "loss": 0.15560413599014283,
+      "step": 2085
+    },
+    {
+      "epoch": 0.3802765647743814,
+      "grad_norm": 0.18439625203609467,
+      "learning_rate": 3.559917123194325e-05,
+      "loss": 0.16695556640625,
+      "step": 2090
+    },
+    {
+      "epoch": 0.3811863173216885,
+      "grad_norm": 0.1693502813577652,
+      "learning_rate": 3.55324601930666e-05,
+      "loss": 0.15957870483398437,
+      "step": 2095
+    },
+    {
+      "epoch": 0.38209606986899564,
+      "grad_norm": 0.17776088416576385,
+      "learning_rate": 3.54656578479583e-05,
+      "loss": 0.1527492880821228,
+      "step": 2100
+    },
+    {
+      "epoch": 0.38300582241630277,
+      "grad_norm": 0.15993724763393402,
+      "learning_rate": 3.539876477572998e-05,
+      "loss": 0.1567505717277527,
+      "step": 2105
+    },
+    {
+      "epoch": 0.3839155749636099,
+      "grad_norm": 0.17067375779151917,
+      "learning_rate": 3.533178155627981e-05,
+      "loss": 0.14660797119140626,
+      "step": 2110
+    },
+    {
+      "epoch": 0.384825327510917,
+      "grad_norm": 0.20239882171154022,
+      "learning_rate": 3.526470877028745e-05,
+      "loss": 0.1596767544746399,
+      "step": 2115
+    },
+    {
+      "epoch": 0.38573508005822416,
+      "grad_norm": 0.1863643079996109,
+      "learning_rate": 3.5197546999209005e-05,
+      "loss": 0.15738571882247926,
+      "step": 2120
+    },
+    {
+      "epoch": 0.3866448326055313,
+      "grad_norm": 0.16994133591651917,
+      "learning_rate": 3.5130296825272014e-05,
+      "loss": 0.16255316734313965,
+      "step": 2125
+    },
+    {
+      "epoch": 0.3875545851528384,
+      "grad_norm": 0.18703415989875793,
+      "learning_rate": 3.5062958831470355e-05,
+      "loss": 0.15206334590911866,
+      "step": 2130
+    },
+    {
+      "epoch": 0.38846433770014555,
+      "grad_norm": 0.15433982014656067,
+      "learning_rate": 3.4995533601559226e-05,
+      "loss": 0.1590178370475769,
+      "step": 2135
+    },
+    {
+      "epoch": 0.3893740902474527,
+      "grad_norm": 0.16498146951198578,
+      "learning_rate": 3.4928021720050104e-05,
+      "loss": 0.14759145975112914,
+      "step": 2140
+    },
+    {
+      "epoch": 0.3902838427947598,
+      "grad_norm": 0.17880478501319885,
+      "learning_rate": 3.486042377220562e-05,
+      "loss": 0.1642458915710449,
+      "step": 2145
+    },
+    {
+      "epoch": 0.39119359534206694,
+      "grad_norm": 0.14700061082839966,
+      "learning_rate": 3.479274034403455e-05,
+      "loss": 0.16105138063430785,
+      "step": 2150
+    },
+    {
+      "epoch": 0.39210334788937407,
+      "grad_norm": 0.1620762050151825,
+      "learning_rate": 3.472497202228664e-05,
+      "loss": 0.15104985237121582,
+      "step": 2155
+    },
+    {
+      "epoch": 0.3930131004366812,
+      "grad_norm": 0.1625058799982071,
+      "learning_rate": 3.4657119394447654e-05,
+      "loss": 0.16145485639572144,
+      "step": 2160
+    },
+    {
+      "epoch": 0.3939228529839884,
+      "grad_norm": 0.1631549596786499,
+      "learning_rate": 3.458918304873417e-05,
+      "loss": 0.16712255477905275,
+      "step": 2165
+    },
+    {
+      "epoch": 0.3948326055312955,
+      "grad_norm": 0.16041551530361176,
+      "learning_rate": 3.452116357408853e-05,
+      "loss": 0.15118330717086792,
+      "step": 2170
+    },
+    {
+      "epoch": 0.39574235807860264,
+      "grad_norm": 0.16692611575126648,
+      "learning_rate": 3.44530615601737e-05,
+      "loss": 0.16982550621032716,
+      "step": 2175
+    },
+    {
+      "epoch": 0.39665211062590977,
+      "grad_norm": 0.16082268953323364,
+      "learning_rate": 3.438487759736821e-05,
+      "loss": 0.1513260006904602,
+      "step": 2180
+    },
+    {
+      "epoch": 0.3975618631732169,
+      "grad_norm": 0.1474589854478836,
+      "learning_rate": 3.4316612276761004e-05,
+      "loss": 0.14968743324279785,
+      "step": 2185
+    },
+    {
+      "epoch": 0.39847161572052403,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 3.42482661901463e-05,
+      "loss": 0.1563260555267334,
+      "step": 2190
+    },
+    {
+      "epoch": 0.39938136826783116,
+      "grad_norm": 0.16775506734848022,
+      "learning_rate": 3.41798399300185e-05,
+      "loss": 0.14861010313034057,
+      "step": 2195
+    },
+    {
+      "epoch": 0.4002911208151383,
+      "grad_norm": 0.15065217018127441,
+      "learning_rate": 3.411133408956703e-05,
+      "loss": 0.15559519529342652,
+      "step": 2200
+    },
+    {
+      "epoch": 0.4012008733624454,
+      "grad_norm": 0.16655296087265015,
+      "learning_rate": 3.4042749262671184e-05,
+      "loss": 0.16025567054748535,
+      "step": 2205
+    },
+    {
+      "epoch": 0.40211062590975255,
+      "grad_norm": 0.14773905277252197,
+      "learning_rate": 3.397408604389501e-05,
+      "loss": 0.15074082612991332,
+      "step": 2210
+    },
+    {
+      "epoch": 0.4030203784570597,
+      "grad_norm": 0.16233304142951965,
+      "learning_rate": 3.3905345028482125e-05,
+      "loss": 0.15490520000457764,
+      "step": 2215
+    },
+    {
+      "epoch": 0.4039301310043668,
+      "grad_norm": 0.17520153522491455,
+      "learning_rate": 3.383652681235058e-05,
+      "loss": 0.1517520785331726,
+      "step": 2220
+    },
+    {
+      "epoch": 0.40483988355167394,
+      "grad_norm": 0.14749875664710999,
+      "learning_rate": 3.376763199208766e-05,
+      "loss": 0.15410997867584228,
+      "step": 2225
+    },
+    {
+      "epoch": 0.40574963609898107,
+      "grad_norm": 0.16855919361114502,
+      "learning_rate": 3.369866116494477e-05,
+      "loss": 0.1510261058807373,
+      "step": 2230
+    },
+    {
+      "epoch": 0.4066593886462882,
+      "grad_norm": 0.1594122350215912,
+      "learning_rate": 3.362961492883218e-05,
+      "loss": 0.1493813395500183,
+      "step": 2235
+    },
+    {
+      "epoch": 0.40756914119359533,
+      "grad_norm": 0.13645926117897034,
+      "learning_rate": 3.3560493882313915e-05,
+      "loss": 0.14876762628555298,
+      "step": 2240
+    },
+    {
+      "epoch": 0.40847889374090246,
+      "grad_norm": 0.14304400980472565,
+      "learning_rate": 3.349129862460251e-05,
+      "loss": 0.15567013025283813,
+      "step": 2245
+    },
+    {
+      "epoch": 0.4093886462882096,
+      "grad_norm": 0.17040041089057922,
+      "learning_rate": 3.342202975555386e-05,
+      "loss": 0.1563249945640564,
+      "step": 2250
+    },
+    {
+      "epoch": 0.4102983988355167,
+      "grad_norm": 0.15594671666622162,
+      "learning_rate": 3.3352687875661984e-05,
+      "loss": 0.1546410083770752,
+      "step": 2255
+    },
+    {
+      "epoch": 0.41120815138282385,
+      "grad_norm": 0.1677195280790329,
+      "learning_rate": 3.328327358605384e-05,
+      "loss": 0.15710171461105346,
+      "step": 2260
+    },
+    {
+      "epoch": 0.412117903930131,
+      "grad_norm": 0.1731705516576767,
+      "learning_rate": 3.321378748848412e-05,
+      "loss": 0.16444036960601807,
+      "step": 2265
+    },
+    {
+      "epoch": 0.4130276564774381,
+      "grad_norm": 0.18779033422470093,
+      "learning_rate": 3.3144230185329984e-05,
+      "loss": 0.15659687519073487,
+      "step": 2270
+    },
+    {
+      "epoch": 0.4139374090247453,
+      "grad_norm": 0.1543768346309662,
+      "learning_rate": 3.3074602279585913e-05,
+      "loss": 0.15100739002227784,
+      "step": 2275
+    },
+    {
+      "epoch": 0.4148471615720524,
+      "grad_norm": 0.16672168672084808,
+      "learning_rate": 3.300490437485843e-05,
+      "loss": 0.15535364151000977,
+      "step": 2280
+    },
+    {
+      "epoch": 0.41575691411935956,
+      "grad_norm": 0.16741308569908142,
+      "learning_rate": 3.293513707536089e-05,
+      "loss": 0.15523911714553834,
+      "step": 2285
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 0.1488303542137146,
+      "learning_rate": 3.286530098590822e-05,
+      "loss": 0.1542000651359558,
+      "step": 2290
+    },
+    {
+      "epoch": 0.4175764192139738,
+      "grad_norm": 0.1637732982635498,
+      "learning_rate": 3.2795396711911694e-05,
+      "loss": 0.15354831218719484,
+      "step": 2295
+    },
+    {
+      "epoch": 0.41848617176128095,
+      "grad_norm": 0.1472022533416748,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.16235145330429077,
+      "step": 2300
+    },
+    {
+      "epoch": 0.4193959243085881,
+      "grad_norm": 0.15908290445804596,
+      "learning_rate": 3.265538603488241e-05,
+      "loss": 0.15642645359039306,
+      "step": 2305
+    },
+    {
+      "epoch": 0.4203056768558952,
+      "grad_norm": 0.1584865301847458,
+      "learning_rate": 3.2585280845606645e-05,
+      "loss": 0.15490249395370484,
+      "step": 2310
+    },
+    {
+      "epoch": 0.42121542940320233,
+      "grad_norm": 0.15893949568271637,
+      "learning_rate": 3.251510989929052e-05,
+      "loss": 0.1598116159439087,
+      "step": 2315
+    },
+    {
+      "epoch": 0.42212518195050946,
+      "grad_norm": 0.18930596113204956,
+      "learning_rate": 3.244487380424817e-05,
+      "loss": 0.1482008934020996,
+      "step": 2320
+    },
+    {
+      "epoch": 0.4230349344978166,
+      "grad_norm": 0.132876455783844,
+      "learning_rate": 3.237457316935856e-05,
+      "loss": 0.15304710865020751,
+      "step": 2325
+    },
+    {
+      "epoch": 0.4239446870451237,
+      "grad_norm": 0.16447032988071442,
+      "learning_rate": 3.2304208604060106e-05,
+      "loss": 0.15298750400543212,
+      "step": 2330
+    },
+    {
+      "epoch": 0.42485443959243085,
+      "grad_norm": 0.17748120427131653,
+      "learning_rate": 3.223378071834546e-05,
+      "loss": 0.1556084156036377,
+      "step": 2335
+    },
+    {
+      "epoch": 0.425764192139738,
+      "grad_norm": 0.16366586089134216,
+      "learning_rate": 3.2163290122756206e-05,
+      "loss": 0.14387927055358887,
+      "step": 2340
+    },
+    {
+      "epoch": 0.4266739446870451,
+      "grad_norm": 0.15398970246315002,
+      "learning_rate": 3.209273742837755e-05,
+      "loss": 0.16091293096542358,
+      "step": 2345
+    },
+    {
+      "epoch": 0.42758369723435224,
+      "grad_norm": 0.164212167263031,
+      "learning_rate": 3.202212324683305e-05,
+      "loss": 0.15523531436920165,
+      "step": 2350
+    },
+    {
+      "epoch": 0.4284934497816594,
+      "grad_norm": 0.16749800741672516,
+      "learning_rate": 3.1951448190279255e-05,
+      "loss": 0.15354975461959838,
+      "step": 2355
+    },
+    {
+      "epoch": 0.4294032023289665,
+      "grad_norm": 0.14137034118175507,
+      "learning_rate": 3.18807128714005e-05,
+      "loss": 0.14981694221496583,
+      "step": 2360
+    },
+    {
+      "epoch": 0.43031295487627363,
+      "grad_norm": 0.14848439395427704,
+      "learning_rate": 3.1809917903403507e-05,
+      "loss": 0.15448769330978393,
+      "step": 2365
+    },
+    {
+      "epoch": 0.43122270742358076,
+      "grad_norm": 0.1747605800628662,
+      "learning_rate": 3.1739063900012095e-05,
+      "loss": 0.15882387161254882,
+      "step": 2370
+    },
+    {
+      "epoch": 0.4321324599708879,
+      "grad_norm": 0.16054467856884003,
+      "learning_rate": 3.166815147546186e-05,
+      "loss": 0.15170297622680665,
+      "step": 2375
+    },
+    {
+      "epoch": 0.433042212518195,
+      "grad_norm": 0.15428027510643005,
+      "learning_rate": 3.1597181244494886e-05,
+      "loss": 0.16202548742294312,
+      "step": 2380
+    },
+    {
+      "epoch": 0.4339519650655022,
+      "grad_norm": 0.16747219860553741,
+      "learning_rate": 3.1526153822354325e-05,
+      "loss": 0.15461477041244506,
+      "step": 2385
+    },
+    {
+      "epoch": 0.43486171761280934,
+      "grad_norm": 0.17415772378444672,
+      "learning_rate": 3.145506982477918e-05,
+      "loss": 0.16173542737960817,
+      "step": 2390
+    },
+    {
+      "epoch": 0.43577147016011647,
+      "grad_norm": 0.1293518990278244,
+      "learning_rate": 3.1383929867998865e-05,
+      "loss": 0.15572521686553956,
+      "step": 2395
+    },
+    {
+      "epoch": 0.4366812227074236,
+      "grad_norm": 0.16909323632717133,
+      "learning_rate": 3.1312734568727935e-05,
+      "loss": 0.15898628234863282,
+      "step": 2400
+    },
+    {
+      "epoch": 0.43759097525473073,
+      "grad_norm": 0.16770294308662415,
+      "learning_rate": 3.124148454416069e-05,
+      "loss": 0.1536281704902649,
+      "step": 2405
+    },
+    {
+      "epoch": 0.43850072780203786,
+      "grad_norm": 0.14078612625598907,
+      "learning_rate": 3.117018041196585e-05,
+      "loss": 0.15274266004562378,
+      "step": 2410
+    },
+    {
+      "epoch": 0.439410480349345,
+      "grad_norm": 0.15457536280155182,
+      "learning_rate": 3.1098822790281226e-05,
+      "loss": 0.15391263961791993,
+      "step": 2415
+    },
+    {
+      "epoch": 0.4403202328966521,
+      "grad_norm": 0.1640717089176178,
+      "learning_rate": 3.102741229770827e-05,
+      "loss": 0.15515168905258178,
+      "step": 2420
+    },
+    {
+      "epoch": 0.44122998544395925,
+      "grad_norm": 0.2601533830165863,
+      "learning_rate": 3.095594955330683e-05,
+      "loss": 0.1587247371673584,
+      "step": 2425
+    },
+    {
+      "epoch": 0.4421397379912664,
+      "grad_norm": 0.1352529525756836,
+      "learning_rate": 3.08844351765897e-05,
+      "loss": 0.1483217477798462,
+      "step": 2430
+    },
+    {
+      "epoch": 0.4430494905385735,
+      "grad_norm": 0.18479721248149872,
+      "learning_rate": 3.081286978751728e-05,
+      "loss": 0.15121787786483765,
+      "step": 2435
+    },
+    {
+      "epoch": 0.44395924308588064,
+      "grad_norm": 0.16954511404037476,
+      "learning_rate": 3.074125400649221e-05,
+      "loss": 0.16073100566864013,
+      "step": 2440
+    },
+    {
+      "epoch": 0.44486899563318777,
+      "grad_norm": 0.15154729783535004,
+      "learning_rate": 3.0669588454353944e-05,
+      "loss": 0.15738017559051515,
+      "step": 2445
+    },
+    {
+      "epoch": 0.4457787481804949,
+      "grad_norm": 0.1540488302707672,
+      "learning_rate": 3.059787375237344e-05,
+      "loss": 0.1515384554862976,
+      "step": 2450
+    },
+    {
+      "epoch": 0.44668850072780203,
+      "grad_norm": 0.1814432442188263,
+      "learning_rate": 3.052611052224774e-05,
+      "loss": 0.15731438398361205,
+      "step": 2455
+    },
+    {
+      "epoch": 0.44759825327510916,
+      "grad_norm": 0.16657036542892456,
+      "learning_rate": 3.0454299386094542e-05,
+      "loss": 0.15741543769836425,
+      "step": 2460
+    },
+    {
+      "epoch": 0.4485080058224163,
+      "grad_norm": 0.2177237570285797,
+      "learning_rate": 3.0382440966446875e-05,
+      "loss": 0.14972515106201173,
+      "step": 2465
+    },
+    {
+      "epoch": 0.4494177583697234,
+      "grad_norm": 0.1669909954071045,
+      "learning_rate": 3.031053588624766e-05,
+      "loss": 0.1506432294845581,
+      "step": 2470
+    },
+    {
+      "epoch": 0.45032751091703055,
+      "grad_norm": 0.1752234250307083,
+      "learning_rate": 3.0238584768844313e-05,
+      "loss": 0.14969609975814818,
+      "step": 2475
+    },
+    {
+      "epoch": 0.4512372634643377,
+      "grad_norm": 0.18267901241779327,
+      "learning_rate": 3.0166588237983363e-05,
+      "loss": 0.15112748146057128,
+      "step": 2480
+    },
+    {
+      "epoch": 0.4521470160116448,
+      "grad_norm": 0.16250105202198029,
+      "learning_rate": 3.0094546917805007e-05,
+      "loss": 0.15864100456237792,
+      "step": 2485
+    },
+    {
+      "epoch": 0.45305676855895194,
+      "grad_norm": 0.14825721085071564,
+      "learning_rate": 3.0022461432837752e-05,
+      "loss": 0.1513954520225525,
+      "step": 2490
+    },
+    {
+      "epoch": 0.4539665211062591,
+      "grad_norm": 0.1626640111207962,
+      "learning_rate": 2.9950332407992943e-05,
+      "loss": 0.1505578875541687,
+      "step": 2495
+    },
+    {
+      "epoch": 0.45487627365356625,
+      "grad_norm": 0.1535351574420929,
+      "learning_rate": 2.987816046855939e-05,
+      "loss": 0.15255829095840454,
+      "step": 2500
+    },
+    {
+      "epoch": 0.4557860262008734,
+      "grad_norm": 0.17552775144577026,
+      "learning_rate": 2.9805946240197928e-05,
+      "loss": 0.1516443133354187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.4566957787481805,
+      "grad_norm": 0.16020981967449188,
+      "learning_rate": 2.9733690348935994e-05,
+      "loss": 0.14519743919372557,
+      "step": 2510
+    },
+    {
+      "epoch": 0.45760553129548764,
+      "grad_norm": 0.17800211906433105,
+      "learning_rate": 2.9661393421162204e-05,
+      "loss": 0.15679080486297609,
+      "step": 2515
+    },
+    {
+      "epoch": 0.4585152838427948,
+      "grad_norm": 0.16016991436481476,
+      "learning_rate": 2.9589056083620902e-05,
+      "loss": 0.14768127202987671,
+      "step": 2520
+    },
+    {
+      "epoch": 0.4594250363901019,
+      "grad_norm": 0.16272081434726715,
+      "learning_rate": 2.951667896340679e-05,
+      "loss": 0.1513301968574524,
+      "step": 2525
+    },
+    {
+      "epoch": 0.46033478893740903,
+      "grad_norm": 0.1726413071155548,
+      "learning_rate": 2.9444262687959402e-05,
+      "loss": 0.14819332361221313,
+      "step": 2530
+    },
+    {
+      "epoch": 0.46124454148471616,
+      "grad_norm": 0.1670403778553009,
+      "learning_rate": 2.9371807885057735e-05,
+      "loss": 0.15245940685272216,
+      "step": 2535
+    },
+    {
+      "epoch": 0.4621542940320233,
+      "grad_norm": 0.1650049239397049,
+      "learning_rate": 2.9299315182814772e-05,
+      "loss": 0.15187418460845947,
+      "step": 2540
+    },
+    {
+      "epoch": 0.4630640465793304,
+      "grad_norm": 0.16327734291553497,
+      "learning_rate": 2.9226785209672047e-05,
+      "loss": 0.15579828023910522,
+      "step": 2545
+    },
+    {
+      "epoch": 0.46397379912663755,
+      "grad_norm": 0.3367880582809448,
+      "learning_rate": 2.91542185943942e-05,
+      "loss": 0.15617697238922118,
+      "step": 2550
+    },
+    {
+      "epoch": 0.4648835516739447,
+      "grad_norm": 0.1731594055891037,
+      "learning_rate": 2.908161596606353e-05,
+      "loss": 0.1559603691101074,
+      "step": 2555
+    },
+    {
+      "epoch": 0.4657933042212518,
+      "grad_norm": 0.1477293074131012,
+      "learning_rate": 2.9008977954074517e-05,
+      "loss": 0.15567959547042848,
+      "step": 2560
+    },
+    {
+      "epoch": 0.46670305676855894,
+      "grad_norm": 0.16227173805236816,
+      "learning_rate": 2.8936305188128392e-05,
+      "loss": 0.1522113561630249,
+      "step": 2565
+    },
+    {
+      "epoch": 0.4676128093158661,
+      "grad_norm": 0.2031075656414032,
+      "learning_rate": 2.8863598298227674e-05,
+      "loss": 0.15054640769958497,
+      "step": 2570
+    },
+    {
+      "epoch": 0.4685225618631732,
+      "grad_norm": 0.18351472914218903,
+      "learning_rate": 2.8790857914670698e-05,
+      "loss": 0.15837019681930542,
+      "step": 2575
+    },
+    {
+      "epoch": 0.46943231441048033,
+      "grad_norm": 0.15914765000343323,
+      "learning_rate": 2.871808466804616e-05,
+      "loss": 0.1550259470939636,
+      "step": 2580
+    },
+    {
+      "epoch": 0.47034206695778746,
+      "grad_norm": 0.17366717755794525,
+      "learning_rate": 2.8645279189227636e-05,
+      "loss": 0.15702390670776367,
+      "step": 2585
+    },
+    {
+      "epoch": 0.4712518195050946,
+      "grad_norm": 0.13677838444709778,
+      "learning_rate": 2.8572442109368134e-05,
+      "loss": 0.15485031604766847,
+      "step": 2590
+    },
+    {
+      "epoch": 0.4721615720524017,
+      "grad_norm": 0.1477748304605484,
+      "learning_rate": 2.8499574059894617e-05,
+      "loss": 0.14577245712280273,
+      "step": 2595
+    },
+    {
+      "epoch": 0.47307132459970885,
+      "grad_norm": 0.1582217663526535,
+      "learning_rate": 2.842667567250252e-05,
+      "loss": 0.15586793422698975,
+      "step": 2600
+    },
+    {
+      "epoch": 0.47398107714701604,
+      "grad_norm": 0.19658738374710083,
+      "learning_rate": 2.8353747579150268e-05,
+      "loss": 0.15060495138168334,
+      "step": 2605
+    },
+    {
+      "epoch": 0.47489082969432317,
+      "grad_norm": 0.176767036318779,
+      "learning_rate": 2.828079041205382e-05,
+      "loss": 0.15116705894470214,
+      "step": 2610
+    },
+    {
+      "epoch": 0.4758005822416303,
+      "grad_norm": 0.16972507536411285,
+      "learning_rate": 2.820780480368117e-05,
+      "loss": 0.1541937470436096,
+      "step": 2615
+    },
+    {
+      "epoch": 0.47671033478893743,
+      "grad_norm": 0.1548585742712021,
+      "learning_rate": 2.8134791386746884e-05,
+      "loss": 0.14334756135940552,
+      "step": 2620
+    },
+    {
+      "epoch": 0.47762008733624456,
+      "grad_norm": 0.15411986410617828,
+      "learning_rate": 2.806175079420658e-05,
+      "loss": 0.14642289876937867,
+      "step": 2625
+    },
+    {
+      "epoch": 0.4785298398835517,
+      "grad_norm": 0.16609491407871246,
+      "learning_rate": 2.7988683659251474e-05,
+      "loss": 0.15083469152450563,
+      "step": 2630
+    },
+    {
+      "epoch": 0.4794395924308588,
+      "grad_norm": 0.16592684388160706,
+      "learning_rate": 2.791559061530289e-05,
+      "loss": 0.14218480587005616,
+      "step": 2635
+    },
+    {
+      "epoch": 0.48034934497816595,
+      "grad_norm": 0.1764935404062271,
+      "learning_rate": 2.7842472296006722e-05,
+      "loss": 0.15004343986511232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.4812590975254731,
+      "grad_norm": 0.20094354450702667,
+      "learning_rate": 2.7769329335228022e-05,
+      "loss": 0.14975016117095946,
+      "step": 2645
+    },
+    {
+      "epoch": 0.4821688500727802,
+      "grad_norm": 0.1869269460439682,
+      "learning_rate": 2.769616236704542e-05,
+      "loss": 0.155981707572937,
+      "step": 2650
+    },
+    {
+      "epoch": 0.48307860262008734,
+      "grad_norm": 0.16671574115753174,
+      "learning_rate": 2.762297202574571e-05,
+      "loss": 0.14633859395980836,
+      "step": 2655
+    },
+    {
+      "epoch": 0.48398835516739447,
+      "grad_norm": 0.14999663829803467,
+      "learning_rate": 2.754975894581826e-05,
+      "loss": 0.15692603588104248,
+      "step": 2660
+    },
+    {
+      "epoch": 0.4848981077147016,
+      "grad_norm": 0.16893649101257324,
+      "learning_rate": 2.7476523761949592e-05,
+      "loss": 0.14530394077301026,
+      "step": 2665
+    },
+    {
+      "epoch": 0.48580786026200873,
+      "grad_norm": 0.16039884090423584,
+      "learning_rate": 2.740326710901784e-05,
+      "loss": 0.15013915300369263,
+      "step": 2670
+    },
+    {
+      "epoch": 0.48671761280931586,
+      "grad_norm": 0.16672006249427795,
+      "learning_rate": 2.732998962208725e-05,
+      "loss": 0.15667349100112915,
+      "step": 2675
+    },
+    {
+      "epoch": 0.487627365356623,
+      "grad_norm": 0.2160867303609848,
+      "learning_rate": 2.7256691936402684e-05,
+      "loss": 0.14335414171218872,
+      "step": 2680
+    },
+    {
+      "epoch": 0.4885371179039301,
+      "grad_norm": 0.349030077457428,
+      "learning_rate": 2.71833746873841e-05,
+      "loss": 0.1437530279159546,
+      "step": 2685
+    },
+    {
+      "epoch": 0.48944687045123725,
+      "grad_norm": 0.18380966782569885,
+      "learning_rate": 2.7110038510621073e-05,
+      "loss": 0.1476014256477356,
+      "step": 2690
+    },
+    {
+      "epoch": 0.4903566229985444,
+      "grad_norm": 0.1523742377758026,
+      "learning_rate": 2.703668404186722e-05,
+      "loss": 0.14578526020050048,
+      "step": 2695
+    },
+    {
+      "epoch": 0.4912663755458515,
+      "grad_norm": 0.16092729568481445,
+      "learning_rate": 2.696331191703479e-05,
+      "loss": 0.15335593223571778,
+      "step": 2700
+    },
+    {
+      "epoch": 0.49217612809315864,
+      "grad_norm": 0.17185333371162415,
+      "learning_rate": 2.688992277218904e-05,
+      "loss": 0.1540898084640503,
+      "step": 2705
+    },
+    {
+      "epoch": 0.49308588064046577,
+      "grad_norm": 0.1521969735622406,
+      "learning_rate": 2.6816517243542792e-05,
+      "loss": 0.15171396732330322,
+      "step": 2710
+    },
+    {
+      "epoch": 0.49399563318777295,
+      "grad_norm": 0.16064171493053436,
+      "learning_rate": 2.674309596745092e-05,
+      "loss": 0.1505839228630066,
+      "step": 2715
+    },
+    {
+      "epoch": 0.4949053857350801,
+      "grad_norm": 0.16430898010730743,
+      "learning_rate": 2.6669659580404795e-05,
+      "loss": 0.1551363468170166,
+      "step": 2720
+    },
+    {
+      "epoch": 0.4958151382823872,
+      "grad_norm": 0.16125477850437164,
+      "learning_rate": 2.659620871902677e-05,
+      "loss": 0.15069286823272704,
+      "step": 2725
+    },
+    {
+      "epoch": 0.49672489082969434,
+      "grad_norm": 0.1428450047969818,
+      "learning_rate": 2.652274402006471e-05,
+      "loss": 0.15511081218719483,
+      "step": 2730
+    },
+    {
+      "epoch": 0.4976346433770015,
+      "grad_norm": 0.15452754497528076,
+      "learning_rate": 2.6449266120386406e-05,
+      "loss": 0.14941939115524291,
+      "step": 2735
+    },
+    {
+      "epoch": 0.4985443959243086,
+      "grad_norm": 0.17243537306785583,
+      "learning_rate": 2.6375775656974123e-05,
+      "loss": 0.151741623878479,
+      "step": 2740
+    },
+    {
+      "epoch": 0.49945414847161573,
+      "grad_norm": 0.13736453652381897,
+      "learning_rate": 2.6302273266919008e-05,
+      "loss": 0.147042977809906,
+      "step": 2745
+    },
+    {
+      "epoch": 0.5003639010189228,
+      "grad_norm": 0.16241495311260223,
+      "learning_rate": 2.6228759587415614e-05,
+      "loss": 0.14664684534072875,
+      "step": 2750
+    },
+    {
+      "epoch": 0.50127365356623,
+      "grad_norm": 0.193496435880661,
+      "learning_rate": 2.6155235255756356e-05,
+      "loss": 0.15486966371536254,
+      "step": 2755
+    },
+    {
+      "epoch": 0.5021834061135371,
+      "grad_norm": 0.1542847901582718,
+      "learning_rate": 2.6081700909326e-05,
+      "loss": 0.15148009061813356,
+      "step": 2760
+    },
+    {
+      "epoch": 0.5030931586608443,
+      "grad_norm": 0.1696511209011078,
+      "learning_rate": 2.6008157185596142e-05,
+      "loss": 0.14190055131912233,
+      "step": 2765
+    },
+    {
+      "epoch": 0.5040029112081513,
+      "grad_norm": 0.14690077304840088,
+      "learning_rate": 2.5934604722119655e-05,
+      "loss": 0.1570739269256592,
+      "step": 2770
+    },
+    {
+      "epoch": 0.5049126637554585,
+      "grad_norm": 0.17149671912193298,
+      "learning_rate": 2.5861044156525162e-05,
+      "loss": 0.14940304756164552,
+      "step": 2775
+    },
+    {
+      "epoch": 0.5058224163027657,
+      "grad_norm": 0.16639231145381927,
+      "learning_rate": 2.578747612651155e-05,
+      "loss": 0.15691237449645995,
+      "step": 2780
+    },
+    {
+      "epoch": 0.5067321688500728,
+      "grad_norm": 0.2062763124704361,
+      "learning_rate": 2.5713901269842404e-05,
+      "loss": 0.1564734935760498,
+      "step": 2785
+    },
+    {
+      "epoch": 0.50764192139738,
+      "grad_norm": 0.12636308372020721,
+      "learning_rate": 2.5640320224340502e-05,
+      "loss": 0.14539417028427123,
+      "step": 2790
+    },
+    {
+      "epoch": 0.508551673944687,
+      "grad_norm": 0.16893689334392548,
+      "learning_rate": 2.556673362788225e-05,
+      "loss": 0.15440930128097535,
+      "step": 2795
+    },
+    {
+      "epoch": 0.5094614264919942,
+      "grad_norm": 0.16250015795230865,
+      "learning_rate": 2.54931421183922e-05,
+      "loss": 0.14485647678375244,
+      "step": 2800
+    },
+    {
+      "epoch": 0.5103711790393013,
+      "grad_norm": 0.1700994372367859,
+      "learning_rate": 2.5419546333837462e-05,
+      "loss": 0.15411126613616943,
+      "step": 2805
+    },
+    {
+      "epoch": 0.5112809315866085,
+      "grad_norm": 0.1547706127166748,
+      "learning_rate": 2.5345946912222256e-05,
+      "loss": 0.15516072511672974,
+      "step": 2810
+    },
+    {
+      "epoch": 0.5121906841339156,
+      "grad_norm": 0.17955681681632996,
+      "learning_rate": 2.527234449158228e-05,
+      "loss": 0.15546923875808716,
+      "step": 2815
+    },
+    {
+      "epoch": 0.5131004366812227,
+      "grad_norm": 0.163709819316864,
+      "learning_rate": 2.519873970997927e-05,
+      "loss": 0.15665037631988527,
+      "step": 2820
+    },
+    {
+      "epoch": 0.5140101892285298,
+      "grad_norm": 0.17859576642513275,
+      "learning_rate": 2.5125133205495405e-05,
+      "loss": 0.1539722204208374,
+      "step": 2825
+    },
+    {
+      "epoch": 0.514919941775837,
+      "grad_norm": 0.17443150281906128,
+      "learning_rate": 2.5051525616227806e-05,
+      "loss": 0.148411762714386,
+      "step": 2830
+    },
+    {
+      "epoch": 0.5158296943231441,
+      "grad_norm": 0.17397581040859222,
+      "learning_rate": 2.4977917580283007e-05,
+      "loss": 0.14880497455596925,
+      "step": 2835
+    },
+    {
+      "epoch": 0.5167394468704513,
+      "grad_norm": 0.14565663039684296,
+      "learning_rate": 2.4904309735771405e-05,
+      "loss": 0.14934680461883545,
+      "step": 2840
+    },
+    {
+      "epoch": 0.5176491994177583,
+      "grad_norm": 0.17895659804344177,
+      "learning_rate": 2.4830702720801746e-05,
+      "loss": 0.15287939310073853,
+      "step": 2845
+    },
+    {
+      "epoch": 0.5185589519650655,
+      "grad_norm": 0.15812788903713226,
+      "learning_rate": 2.4757097173475572e-05,
+      "loss": 0.14576947689056396,
+      "step": 2850
+    },
+    {
+      "epoch": 0.5194687045123726,
+      "grad_norm": 0.17123781144618988,
+      "learning_rate": 2.46834937318817e-05,
+      "loss": 0.15224847793579102,
+      "step": 2855
+    },
+    {
+      "epoch": 0.5203784570596798,
+      "grad_norm": 0.14845474064350128,
+      "learning_rate": 2.460989303409072e-05,
+      "loss": 0.14901585578918458,
+      "step": 2860
+    },
+    {
+      "epoch": 0.5212882096069869,
+      "grad_norm": 0.23493704199790955,
+      "learning_rate": 2.4536295718149407e-05,
+      "loss": 0.1517487049102783,
+      "step": 2865
+    },
+    {
+      "epoch": 0.522197962154294,
+      "grad_norm": 0.16209843754768372,
+      "learning_rate": 2.4462702422075217e-05,
+      "loss": 0.14327445030212402,
+      "step": 2870
+    },
+    {
+      "epoch": 0.5231077147016011,
+      "grad_norm": 0.17249803245067596,
+      "learning_rate": 2.4389113783850793e-05,
+      "loss": 0.1517549753189087,
+      "step": 2875
+    },
+    {
+      "epoch": 0.5240174672489083,
+      "grad_norm": 0.14561402797698975,
+      "learning_rate": 2.431553044141836e-05,
+      "loss": 0.14764087200164794,
+      "step": 2880
+    },
+    {
+      "epoch": 0.5249272197962155,
+      "grad_norm": 0.17033302783966064,
+      "learning_rate": 2.4241953032674256e-05,
+      "loss": 0.15181604623794556,
+      "step": 2885
+    },
+    {
+      "epoch": 0.5258369723435226,
+      "grad_norm": 0.1184430941939354,
+      "learning_rate": 2.4168382195463367e-05,
+      "loss": 0.14264242649078368,
+      "step": 2890
+    },
+    {
+      "epoch": 0.5267467248908297,
+      "grad_norm": 0.17521196603775024,
+      "learning_rate": 2.4094818567573618e-05,
+      "loss": 0.1509538173675537,
+      "step": 2895
+    },
+    {
+      "epoch": 0.5276564774381368,
+      "grad_norm": 0.1681576371192932,
+      "learning_rate": 2.4021262786730428e-05,
+      "loss": 0.15344605445861817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.528566229985444,
+      "grad_norm": 0.17134182155132294,
+      "learning_rate": 2.3947715490591206e-05,
+      "loss": 0.15161689519882202,
+      "step": 2905
+    },
+    {
+      "epoch": 0.5294759825327511,
+      "grad_norm": 0.1796472817659378,
+      "learning_rate": 2.3874177316739778e-05,
+      "loss": 0.15086464881896972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.5303857350800583,
+      "grad_norm": 0.23268625140190125,
+      "learning_rate": 2.380064890268093e-05,
+      "loss": 0.15354180335998535,
+      "step": 2915
+    },
+    {
+      "epoch": 0.5312954876273653,
+      "grad_norm": 0.16318941116333008,
+      "learning_rate": 2.372713088583481e-05,
+      "loss": 0.15131797790527343,
+      "step": 2920
+    },
+    {
+      "epoch": 0.5322052401746725,
+      "grad_norm": 0.18171803653240204,
+      "learning_rate": 2.365362390353143e-05,
+      "loss": 0.15784090757369995,
+      "step": 2925
+    },
+    {
+      "epoch": 0.5331149927219796,
+      "grad_norm": 0.17672640085220337,
+      "learning_rate": 2.3580128593005156e-05,
+      "loss": 0.15509436130523682,
+      "step": 2930
+    },
+    {
+      "epoch": 0.5340247452692868,
+      "grad_norm": 0.15985223650932312,
+      "learning_rate": 2.3506645591389174e-05,
+      "loss": 0.14851027727127075,
+      "step": 2935
+    },
+    {
+      "epoch": 0.5349344978165939,
+      "grad_norm": 0.16597607731819153,
+      "learning_rate": 2.343317553570995e-05,
+      "loss": 0.1504931092262268,
+      "step": 2940
+    },
+    {
+      "epoch": 0.535844250363901,
+      "grad_norm": 0.20180748403072357,
+      "learning_rate": 2.3359719062881725e-05,
+      "loss": 0.15023820400238036,
+      "step": 2945
+    },
+    {
+      "epoch": 0.5367540029112081,
+      "grad_norm": 0.1735963076353073,
+      "learning_rate": 2.3286276809701e-05,
+      "loss": 0.15374408960342406,
+      "step": 2950
+    },
+    {
+      "epoch": 0.5376637554585153,
+      "grad_norm": 0.17629501223564148,
+      "learning_rate": 2.3212849412840995e-05,
+      "loss": 0.15007833242416382,
+      "step": 2955
+    },
+    {
+      "epoch": 0.5385735080058224,
+      "grad_norm": 0.1493796557188034,
+      "learning_rate": 2.3139437508846155e-05,
+      "loss": 0.15206656455993653,
+      "step": 2960
+    },
+    {
+      "epoch": 0.5394832605531296,
+      "grad_norm": 0.17426837980747223,
+      "learning_rate": 2.306604173412659e-05,
+      "loss": 0.1441131591796875,
+      "step": 2965
+    },
+    {
+      "epoch": 0.5403930131004366,
+      "grad_norm": 0.16984431445598602,
+      "learning_rate": 2.2992662724952613e-05,
+      "loss": 0.14438753128051757,
+      "step": 2970
+    },
+    {
+      "epoch": 0.5413027656477438,
+      "grad_norm": 0.1814386397600174,
+      "learning_rate": 2.2919301117449167e-05,
+      "loss": 0.14887022972106934,
+      "step": 2975
+    },
+    {
+      "epoch": 0.5422125181950509,
+      "grad_norm": 0.158392995595932,
+      "learning_rate": 2.2845957547590368e-05,
+      "loss": 0.14404361248016356,
+      "step": 2980
+    },
+    {
+      "epoch": 0.5431222707423581,
+      "grad_norm": 0.17496263980865479,
+      "learning_rate": 2.2772632651193953e-05,
+      "loss": 0.1454906702041626,
+      "step": 2985
+    },
+    {
+      "epoch": 0.5440320232896652,
+      "grad_norm": 0.157533198595047,
+      "learning_rate": 2.2699327063915766e-05,
+      "loss": 0.1458217740058899,
+      "step": 2990
+    },
+    {
+      "epoch": 0.5449417758369723,
+      "grad_norm": 0.1767890453338623,
+      "learning_rate": 2.262604142124427e-05,
+      "loss": 0.14384825229644777,
+      "step": 2995
+    },
+    {
+      "epoch": 0.5458515283842795,
+      "grad_norm": 0.1851050704717636,
+      "learning_rate": 2.2552776358495033e-05,
+      "loss": 0.14832457304000854,
+      "step": 3000
+    },
+    {
+      "epoch": 0.5467612809315866,
+      "grad_norm": 0.164175882935524,
+      "learning_rate": 2.247953251080521e-05,
+      "loss": 0.14999878406524658,
+      "step": 3005
+    },
+    {
+      "epoch": 0.5476710334788938,
+      "grad_norm": 0.3403675854206085,
+      "learning_rate": 2.240631051312804e-05,
+      "loss": 0.1443937063217163,
+      "step": 3010
+    },
+    {
+      "epoch": 0.5485807860262009,
+      "grad_norm": 0.16751109063625336,
+      "learning_rate": 2.2333111000227342e-05,
+      "loss": 0.1462402105331421,
+      "step": 3015
+    },
+    {
+      "epoch": 0.549490538573508,
+      "grad_norm": 0.14741151034832,
+      "learning_rate": 2.225993460667201e-05,
+      "loss": 0.149855899810791,
+      "step": 3020
+    },
+    {
+      "epoch": 0.5504002911208151,
+      "grad_norm": 0.20605266094207764,
+      "learning_rate": 2.218678196683054e-05,
+      "loss": 0.15413178205490113,
+      "step": 3025
+    },
+    {
+      "epoch": 0.5513100436681223,
+      "grad_norm": 0.14884796738624573,
+      "learning_rate": 2.2113653714865473e-05,
+      "loss": 0.14592334032058715,
+      "step": 3030
+    },
+    {
+      "epoch": 0.5522197962154294,
+      "grad_norm": 0.17114350199699402,
+      "learning_rate": 2.2040550484727943e-05,
+      "loss": 0.1498338460922241,
+      "step": 3035
+    },
+    {
+      "epoch": 0.5531295487627366,
+      "grad_norm": 0.16496853530406952,
+      "learning_rate": 2.196747291015219e-05,
+      "loss": 0.14650315046310425,
+      "step": 3040
+    },
+    {
+      "epoch": 0.5540393013100436,
+      "grad_norm": 0.15172401070594788,
+      "learning_rate": 2.189442162465001e-05,
+      "loss": 0.14984124898910522,
+      "step": 3045
+    },
+    {
+      "epoch": 0.5549490538573508,
+      "grad_norm": 0.19258467853069305,
+      "learning_rate": 2.182139726150532e-05,
+      "loss": 0.1486764669418335,
+      "step": 3050
+    },
+    {
+      "epoch": 0.5558588064046579,
+      "grad_norm": 0.1749001443386078,
+      "learning_rate": 2.1748400453768652e-05,
+      "loss": 0.14983701705932617,
+      "step": 3055
+    },
+    {
+      "epoch": 0.5567685589519651,
+      "grad_norm": 0.37510567903518677,
+      "learning_rate": 2.1675431834251637e-05,
+      "loss": 0.14483561515808105,
+      "step": 3060
+    },
+    {
+      "epoch": 0.5576783114992722,
+      "grad_norm": 0.16932405531406403,
+      "learning_rate": 2.1602492035521553e-05,
+      "loss": 0.14487643241882325,
+      "step": 3065
+    },
+    {
+      "epoch": 0.5585880640465793,
+      "grad_norm": 0.174176424741745,
+      "learning_rate": 2.152958168989584e-05,
+      "loss": 0.14737497568130492,
+      "step": 3070
+    },
+    {
+      "epoch": 0.5594978165938864,
+      "grad_norm": 0.1601252257823944,
+      "learning_rate": 2.1456701429436577e-05,
+      "loss": 0.15183379650115966,
+      "step": 3075
+    },
+    {
+      "epoch": 0.5604075691411936,
+      "grad_norm": 0.14960910379886627,
+      "learning_rate": 2.1383851885945085e-05,
+      "loss": 0.143074893951416,
+      "step": 3080
+    },
+    {
+      "epoch": 0.5613173216885007,
+      "grad_norm": 0.1678633838891983,
+      "learning_rate": 2.1311033690956346e-05,
+      "loss": 0.14961432218551635,
+      "step": 3085
+    },
+    {
+      "epoch": 0.5622270742358079,
+      "grad_norm": 0.15814319252967834,
+      "learning_rate": 2.1238247475733613e-05,
+      "loss": 0.14308581352233887,
+      "step": 3090
+    },
+    {
+      "epoch": 0.5631368267831149,
+      "grad_norm": 0.21240772306919098,
+      "learning_rate": 2.1165493871262887e-05,
+      "loss": 0.14737485647201537,
+      "step": 3095
+    },
+    {
+      "epoch": 0.5640465793304221,
+      "grad_norm": 0.15161271393299103,
+      "learning_rate": 2.109277350824749e-05,
+      "loss": 0.14534420967102052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.5649563318777293,
+      "grad_norm": 0.16572362184524536,
+      "learning_rate": 2.1020087017102537e-05,
+      "loss": 0.14299670457839966,
+      "step": 3105
+    },
+    {
+      "epoch": 0.5658660844250364,
+      "grad_norm": 0.1548164039850235,
+      "learning_rate": 2.094743502794954e-05,
+      "loss": 0.14371142387390137,
+      "step": 3110
+    },
+    {
+      "epoch": 0.5667758369723436,
+      "grad_norm": 0.2574169933795929,
+      "learning_rate": 2.0874818170610885e-05,
+      "loss": 0.14350423812866211,
+      "step": 3115
+    },
+    {
+      "epoch": 0.5676855895196506,
+      "grad_norm": 0.16359548270702362,
+      "learning_rate": 2.080223707460443e-05,
+      "loss": 0.1520243763923645,
+      "step": 3120
+    },
+    {
+      "epoch": 0.5685953420669578,
+      "grad_norm": 0.1798320859670639,
+      "learning_rate": 2.072969236913799e-05,
+      "loss": 0.14832595586776734,
+      "step": 3125
+    },
+    {
+      "epoch": 0.5695050946142649,
+      "grad_norm": 0.17045916616916656,
+      "learning_rate": 2.0657184683103926e-05,
+      "loss": 0.15308042764663696,
+      "step": 3130
+    },
+    {
+      "epoch": 0.5704148471615721,
+      "grad_norm": 0.16345897316932678,
+      "learning_rate": 2.058471464507366e-05,
+      "loss": 0.14564799070358275,
+      "step": 3135
+    },
+    {
+      "epoch": 0.5713245997088792,
+      "grad_norm": 0.15170110762119293,
+      "learning_rate": 2.0512282883292257e-05,
+      "loss": 0.14161767959594726,
+      "step": 3140
+    },
+    {
+      "epoch": 0.5722343522561864,
+      "grad_norm": 0.8107472658157349,
+      "learning_rate": 2.0439890025672955e-05,
+      "loss": 0.14481087923049926,
+      "step": 3145
+    },
+    {
+      "epoch": 0.5731441048034934,
+      "grad_norm": 0.15346679091453552,
+      "learning_rate": 2.036753669979174e-05,
+      "loss": 0.14860262870788574,
+      "step": 3150
+    },
+    {
+      "epoch": 0.5740538573508006,
+      "grad_norm": 0.1632593423128128,
+      "learning_rate": 2.0295223532881886e-05,
+      "loss": 0.1481687307357788,
+      "step": 3155
+    },
+    {
+      "epoch": 0.5749636098981077,
+      "grad_norm": 0.23399172723293304,
+      "learning_rate": 2.022295115182852e-05,
+      "loss": 0.149153733253479,
+      "step": 3160
+    },
+    {
+      "epoch": 0.5758733624454149,
+      "grad_norm": 0.14977394044399261,
+      "learning_rate": 2.015072018316323e-05,
+      "loss": 0.14921388626098633,
+      "step": 3165
+    },
+    {
+      "epoch": 0.576783114992722,
+      "grad_norm": 0.1550658792257309,
+      "learning_rate": 2.007853125305856e-05,
+      "loss": 0.1482759475708008,
+      "step": 3170
+    },
+    {
+      "epoch": 0.5776928675400291,
+      "grad_norm": 0.16661737859249115,
+      "learning_rate": 2.0006384987322645e-05,
+      "loss": 0.14903552532196046,
+      "step": 3175
+    },
+    {
+      "epoch": 0.5786026200873362,
+      "grad_norm": 0.1746823936700821,
+      "learning_rate": 1.9934282011393753e-05,
+      "loss": 0.1412947654724121,
+      "step": 3180
+    },
+    {
+      "epoch": 0.5795123726346434,
+      "grad_norm": 0.17025792598724365,
+      "learning_rate": 1.9862222950334857e-05,
+      "loss": 0.15289769172668458,
+      "step": 3185
+    },
+    {
+      "epoch": 0.5804221251819505,
+      "grad_norm": 0.16857658326625824,
+      "learning_rate": 1.9790208428828252e-05,
+      "loss": 0.14419941902160643,
+      "step": 3190
+    },
+    {
+      "epoch": 0.5813318777292577,
+      "grad_norm": 0.16099876165390015,
+      "learning_rate": 1.9718239071170118e-05,
+      "loss": 0.14476487636566163,
+      "step": 3195
+    },
+    {
+      "epoch": 0.5822416302765647,
+      "grad_norm": 0.16140873730182648,
+      "learning_rate": 1.964631550126508e-05,
+      "loss": 0.14588416814804078,
+      "step": 3200
+    },
+    {
+      "epoch": 0.5831513828238719,
+      "grad_norm": 0.15719448029994965,
+      "learning_rate": 1.957443834262087e-05,
+      "loss": 0.15144693851470947,
+      "step": 3205
+    },
+    {
+      "epoch": 0.584061135371179,
+      "grad_norm": 0.16512645781040192,
+      "learning_rate": 1.950260821834285e-05,
+      "loss": 0.14787566661834717,
+      "step": 3210
+    },
+    {
+      "epoch": 0.5849708879184862,
+      "grad_norm": 0.18584516644477844,
+      "learning_rate": 1.9430825751128643e-05,
+      "loss": 0.14514710903167724,
+      "step": 3215
+    },
+    {
+      "epoch": 0.5858806404657934,
+      "grad_norm": 0.17640981078147888,
+      "learning_rate": 1.9359091563262742e-05,
+      "loss": 0.1511004686355591,
+      "step": 3220
+    },
+    {
+      "epoch": 0.5867903930131004,
+      "grad_norm": 0.1697624921798706,
+      "learning_rate": 1.9287406276611095e-05,
+      "loss": 0.15392563343048096,
+      "step": 3225
+    },
+    {
+      "epoch": 0.5877001455604076,
+      "grad_norm": 0.1677260845899582,
+      "learning_rate": 1.9215770512615725e-05,
+      "loss": 0.15311745405197144,
+      "step": 3230
+    },
+    {
+      "epoch": 0.5886098981077147,
+      "grad_norm": 0.15357480943202972,
+      "learning_rate": 1.9144184892289337e-05,
+      "loss": 0.14370160102844237,
+      "step": 3235
+    },
+    {
+      "epoch": 0.5895196506550219,
+      "grad_norm": 0.18601207435131073,
+      "learning_rate": 1.9072650036209955e-05,
+      "loss": 0.14095077514648438,
+      "step": 3240
+    },
+    {
+      "epoch": 0.590429403202329,
+      "grad_norm": 0.17313526570796967,
+      "learning_rate": 1.9001166564515513e-05,
+      "loss": 0.148259174823761,
+      "step": 3245
+    },
+    {
+      "epoch": 0.5913391557496361,
+      "grad_norm": 0.1634378433227539,
+      "learning_rate": 1.8929735096898504e-05,
+      "loss": 0.15082294940948487,
+      "step": 3250
+    },
+    {
+      "epoch": 0.5922489082969432,
+      "grad_norm": 0.18542174994945526,
+      "learning_rate": 1.885835625260058e-05,
+      "loss": 0.14461435079574586,
+      "step": 3255
+    },
+    {
+      "epoch": 0.5931586608442504,
+      "grad_norm": 0.1740756630897522,
+      "learning_rate": 1.87870306504072e-05,
+      "loss": 0.14083608388900756,
+      "step": 3260
+    },
+    {
+      "epoch": 0.5940684133915575,
+      "grad_norm": 0.25606217980384827,
+      "learning_rate": 1.8715758908642288e-05,
+      "loss": 0.15125386714935302,
+      "step": 3265
+    },
+    {
+      "epoch": 0.5949781659388647,
+      "grad_norm": 0.20194627344608307,
+      "learning_rate": 1.8644541645162834e-05,
+      "loss": 0.14433003664016725,
+      "step": 3270
+    },
+    {
+      "epoch": 0.5958879184861717,
+      "grad_norm": 0.1902168095111847,
+      "learning_rate": 1.8573379477353542e-05,
+      "loss": 0.14718132019042968,
+      "step": 3275
+    },
+    {
+      "epoch": 0.5967976710334789,
+      "grad_norm": 0.15122972428798676,
+      "learning_rate": 1.850227302212151e-05,
+      "loss": 0.153376567363739,
+      "step": 3280
+    },
+    {
+      "epoch": 0.597707423580786,
+      "grad_norm": 0.14331959187984467,
+      "learning_rate": 1.843122289589085e-05,
+      "loss": 0.146630597114563,
+      "step": 3285
+    },
+    {
+      "epoch": 0.5986171761280932,
+      "grad_norm": 0.15083099901676178,
+      "learning_rate": 1.836022971459737e-05,
+      "loss": 0.1445971965789795,
+      "step": 3290
+    },
+    {
+      "epoch": 0.5995269286754003,
+      "grad_norm": 0.16585418581962585,
+      "learning_rate": 1.828929409368321e-05,
+      "loss": 0.15120241641998292,
+      "step": 3295
+    },
+    {
+      "epoch": 0.6004366812227074,
+      "grad_norm": 0.1653224229812622,
+      "learning_rate": 1.8218416648091524e-05,
+      "loss": 0.14349838495254516,
+      "step": 3300
+    },
+    {
+      "epoch": 0.6013464337700145,
+      "grad_norm": 0.1891375184059143,
+      "learning_rate": 1.8147597992261124e-05,
+      "loss": 0.15171384811401367,
+      "step": 3305
+    },
+    {
+      "epoch": 0.6022561863173217,
+      "grad_norm": 0.13392704725265503,
+      "learning_rate": 1.8076838740121187e-05,
+      "loss": 0.14607118368148803,
+      "step": 3310
+    },
+    {
+      "epoch": 0.6031659388646288,
+      "grad_norm": 0.15421944856643677,
+      "learning_rate": 1.8006139505085926e-05,
+      "loss": 0.1380957007408142,
+      "step": 3315
+    },
+    {
+      "epoch": 0.604075691411936,
+      "grad_norm": 0.16637761890888214,
+      "learning_rate": 1.7935500900049246e-05,
+      "loss": 0.14604611396789552,
+      "step": 3320
+    },
+    {
+      "epoch": 0.6049854439592431,
+      "grad_norm": 0.16638441383838654,
+      "learning_rate": 1.7864923537379445e-05,
+      "loss": 0.1513611912727356,
+      "step": 3325
+    },
+    {
+      "epoch": 0.6058951965065502,
+      "grad_norm": 0.1745707094669342,
+      "learning_rate": 1.779440802891394e-05,
+      "loss": 0.15391240119934083,
+      "step": 3330
+    },
+    {
+      "epoch": 0.6068049490538574,
+      "grad_norm": 0.1620505005121231,
+      "learning_rate": 1.77239549859539e-05,
+      "loss": 0.14986472129821776,
+      "step": 3335
+    },
+    {
+      "epoch": 0.6077147016011645,
+      "grad_norm": 0.1579132080078125,
+      "learning_rate": 1.7653565019259e-05,
+      "loss": 0.1466603994369507,
+      "step": 3340
+    },
+    {
+      "epoch": 0.6086244541484717,
+      "grad_norm": 0.19154994189739227,
+      "learning_rate": 1.7583238739042086e-05,
+      "loss": 0.15228934288024903,
+      "step": 3345
+    },
+    {
+      "epoch": 0.6095342066957787,
+      "grad_norm": 0.15771779417991638,
+      "learning_rate": 1.7512976754963913e-05,
+      "loss": 0.14965078830718995,
+      "step": 3350
+    },
+    {
+      "epoch": 0.6104439592430859,
+      "grad_norm": 0.18406136333942413,
+      "learning_rate": 1.744277967612785e-05,
+      "loss": 0.1473196864128113,
+      "step": 3355
+    },
+    {
+      "epoch": 0.611353711790393,
+      "grad_norm": 0.17603816092014313,
+      "learning_rate": 1.7372648111074607e-05,
+      "loss": 0.1430676221847534,
+      "step": 3360
+    },
+    {
+      "epoch": 0.6122634643377002,
+      "grad_norm": 0.156408429145813,
+      "learning_rate": 1.7302582667776933e-05,
+      "loss": 0.14018454551696777,
+      "step": 3365
+    },
+    {
+      "epoch": 0.6131732168850073,
+      "grad_norm": 0.14504843950271606,
+      "learning_rate": 1.7232583953634407e-05,
+      "loss": 0.14505640268325806,
+      "step": 3370
+    },
+    {
+      "epoch": 0.6140829694323144,
+      "grad_norm": 0.1864968240261078,
+      "learning_rate": 1.716265257546808e-05,
+      "loss": 0.14810394048690795,
+      "step": 3375
+    },
+    {
+      "epoch": 0.6149927219796215,
+      "grad_norm": 0.1621711403131485,
+      "learning_rate": 1.7092789139515295e-05,
+      "loss": 0.14203091859817504,
+      "step": 3380
+    },
+    {
+      "epoch": 0.6159024745269287,
+      "grad_norm": 0.17994914948940277,
+      "learning_rate": 1.70229942514244e-05,
+      "loss": 0.14565644264221192,
+      "step": 3385
+    },
+    {
+      "epoch": 0.6168122270742358,
+      "grad_norm": 0.1707388162612915,
+      "learning_rate": 1.6953268516249486e-05,
+      "loss": 0.14449434280395507,
+      "step": 3390
+    },
+    {
+      "epoch": 0.617721979621543,
+      "grad_norm": 0.16425329446792603,
+      "learning_rate": 1.6883612538445175e-05,
+      "loss": 0.15185940265655518,
+      "step": 3395
+    },
+    {
+      "epoch": 0.61863173216885,
+      "grad_norm": 0.15987788140773773,
+      "learning_rate": 1.6814026921861335e-05,
+      "loss": 0.14994431734085084,
+      "step": 3400
+    },
+    {
+      "epoch": 0.6195414847161572,
+      "grad_norm": 0.2987690269947052,
+      "learning_rate": 1.6744512269737894e-05,
+      "loss": 0.14652738571166993,
+      "step": 3405
+    },
+    {
+      "epoch": 0.6204512372634643,
+      "grad_norm": 0.1681315004825592,
+      "learning_rate": 1.6675069184699574e-05,
+      "loss": 0.14566165208816528,
+      "step": 3410
+    },
+    {
+      "epoch": 0.6213609898107715,
+      "grad_norm": 0.15847846865653992,
+      "learning_rate": 1.660569826875069e-05,
+      "loss": 0.1374401330947876,
+      "step": 3415
+    },
+    {
+      "epoch": 0.6222707423580786,
+      "grad_norm": 0.16370312869548798,
+      "learning_rate": 1.6536400123269907e-05,
+      "loss": 0.14905524253845215,
+      "step": 3420
+    },
+    {
+      "epoch": 0.6231804949053857,
+      "grad_norm": 0.16054444015026093,
+      "learning_rate": 1.6467175349005054e-05,
+      "loss": 0.1496324896812439,
+      "step": 3425
+    },
+    {
+      "epoch": 0.6240902474526928,
+      "grad_norm": 0.1663951277732849,
+      "learning_rate": 1.639802454606788e-05,
+      "loss": 0.1504170298576355,
+      "step": 3430
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.1591310054063797,
+      "learning_rate": 1.6328948313928906e-05,
+      "loss": 0.1410186171531677,
+      "step": 3435
+    },
+    {
+      "epoch": 0.6259097525473072,
+      "grad_norm": 0.1637524962425232,
+      "learning_rate": 1.6259947251412178e-05,
+      "loss": 0.13963305950164795,
+      "step": 3440
+    },
+    {
+      "epoch": 0.6268195050946143,
+      "grad_norm": 0.1688017100095749,
+      "learning_rate": 1.6191021956690096e-05,
+      "loss": 0.14727941751480103,
+      "step": 3445
+    },
+    {
+      "epoch": 0.6277292576419214,
+      "grad_norm": 0.1691795438528061,
+      "learning_rate": 1.612217302727821e-05,
+      "loss": 0.14856183528900146,
+      "step": 3450
+    },
+    {
+      "epoch": 0.6286390101892285,
+      "grad_norm": 0.18501746654510498,
+      "learning_rate": 1.60534010600301e-05,
+      "loss": 0.1481746554374695,
+      "step": 3455
+    },
+    {
+      "epoch": 0.6295487627365357,
+      "grad_norm": 0.16234716773033142,
+      "learning_rate": 1.5984706651132125e-05,
+      "loss": 0.1427530527114868,
+      "step": 3460
+    },
+    {
+      "epoch": 0.6304585152838428,
+      "grad_norm": 0.16013780236244202,
+      "learning_rate": 1.5916090396098293e-05,
+      "loss": 0.14264426231384278,
+      "step": 3465
+    },
+    {
+      "epoch": 0.63136826783115,
+      "grad_norm": 0.17116396129131317,
+      "learning_rate": 1.5847552889765095e-05,
+      "loss": 0.14109257459640503,
+      "step": 3470
+    },
+    {
+      "epoch": 0.632278020378457,
+      "grad_norm": 0.16949769854545593,
+      "learning_rate": 1.5779094726286344e-05,
+      "loss": 0.1387040376663208,
+      "step": 3475
+    },
+    {
+      "epoch": 0.6331877729257642,
+      "grad_norm": 0.14983431994915009,
+      "learning_rate": 1.5710716499128044e-05,
+      "loss": 0.13645120859146118,
+      "step": 3480
+    },
+    {
+      "epoch": 0.6340975254730713,
+      "grad_norm": 0.1632554531097412,
+      "learning_rate": 1.564241880106321e-05,
+      "loss": 0.14883992671966553,
+      "step": 3485
+    },
+    {
+      "epoch": 0.6350072780203785,
+      "grad_norm": 0.15686506032943726,
+      "learning_rate": 1.5574202224166744e-05,
+      "loss": 0.14244272708892822,
+      "step": 3490
+    },
+    {
+      "epoch": 0.6359170305676856,
+      "grad_norm": 0.18843458592891693,
+      "learning_rate": 1.5506067359810333e-05,
+      "loss": 0.15149861574172974,
+      "step": 3495
+    },
+    {
+      "epoch": 0.6368267831149927,
+      "grad_norm": 0.15874551236629486,
+      "learning_rate": 1.5438014798657275e-05,
+      "loss": 0.15188233852386473,
+      "step": 3500
+    },
+    {
+      "epoch": 0.6377365356622998,
+      "grad_norm": 0.17014239728450775,
+      "learning_rate": 1.5370045130657366e-05,
+      "loss": 0.14694437980651856,
+      "step": 3505
+    },
+    {
+      "epoch": 0.638646288209607,
+      "grad_norm": 0.14744038879871368,
+      "learning_rate": 1.5302158945041838e-05,
+      "loss": 0.14434736967086792,
+      "step": 3510
+    },
+    {
+      "epoch": 0.6395560407569141,
+      "grad_norm": 0.2069770246744156,
+      "learning_rate": 1.523435683031818e-05,
+      "loss": 0.13982917070388795,
+      "step": 3515
+    },
+    {
+      "epoch": 0.6404657933042213,
+      "grad_norm": 0.17811502516269684,
+      "learning_rate": 1.5166639374265063e-05,
+      "loss": 0.1408839702606201,
+      "step": 3520
+    },
+    {
+      "epoch": 0.6413755458515283,
+      "grad_norm": 0.165786474943161,
+      "learning_rate": 1.509900716392728e-05,
+      "loss": 0.15312877893447877,
+      "step": 3525
+    },
+    {
+      "epoch": 0.6422852983988355,
+      "grad_norm": 0.1633884161710739,
+      "learning_rate": 1.5031460785610596e-05,
+      "loss": 0.1488795518875122,
+      "step": 3530
+    },
+    {
+      "epoch": 0.6431950509461426,
+      "grad_norm": 0.16498984396457672,
+      "learning_rate": 1.4964000824876723e-05,
+      "loss": 0.15031465291976928,
+      "step": 3535
+    },
+    {
+      "epoch": 0.6441048034934498,
+      "grad_norm": 0.18043678998947144,
+      "learning_rate": 1.4896627866538191e-05,
+      "loss": 0.147829806804657,
+      "step": 3540
+    },
+    {
+      "epoch": 0.6450145560407569,
+      "grad_norm": 0.16813597083091736,
+      "learning_rate": 1.4829342494653315e-05,
+      "loss": 0.1418998956680298,
+      "step": 3545
+    },
+    {
+      "epoch": 0.645924308588064,
+      "grad_norm": 0.1817242056131363,
+      "learning_rate": 1.4762145292521118e-05,
+      "loss": 0.14508869647979736,
+      "step": 3550
+    },
+    {
+      "epoch": 0.6468340611353712,
+      "grad_norm": 0.14666494727134705,
+      "learning_rate": 1.469503684267628e-05,
+      "loss": 0.14159854650497436,
+      "step": 3555
+    },
+    {
+      "epoch": 0.6477438136826783,
+      "grad_norm": 0.16485381126403809,
+      "learning_rate": 1.4628017726884086e-05,
+      "loss": 0.14419105052947997,
+      "step": 3560
+    },
+    {
+      "epoch": 0.6486535662299855,
+      "grad_norm": 0.16100342571735382,
+      "learning_rate": 1.4561088526135375e-05,
+      "loss": 0.14501721858978273,
+      "step": 3565
+    },
+    {
+      "epoch": 0.6495633187772926,
+      "grad_norm": 0.16996590793132782,
+      "learning_rate": 1.4494249820641493e-05,
+      "loss": 0.1377166509628296,
+      "step": 3570
+    },
+    {
+      "epoch": 0.6504730713245997,
+      "grad_norm": 0.16168837249279022,
+      "learning_rate": 1.4427502189829339e-05,
+      "loss": 0.1414325475692749,
+      "step": 3575
+    },
+    {
+      "epoch": 0.6513828238719068,
+      "grad_norm": 0.16318906843662262,
+      "learning_rate": 1.436084621233621e-05,
+      "loss": 0.14685193300247193,
+      "step": 3580
+    },
+    {
+      "epoch": 0.652292576419214,
+      "grad_norm": 0.1636219322681427,
+      "learning_rate": 1.4294282466004899e-05,
+      "loss": 0.1405899167060852,
+      "step": 3585
+    },
+    {
+      "epoch": 0.6532023289665211,
+      "grad_norm": 0.1838461309671402,
+      "learning_rate": 1.422781152787865e-05,
+      "loss": 0.14386332035064697,
+      "step": 3590
+    },
+    {
+      "epoch": 0.6541120815138283,
+      "grad_norm": 0.1796344667673111,
+      "learning_rate": 1.4161433974196115e-05,
+      "loss": 0.1513024687767029,
+      "step": 3595
+    },
+    {
+      "epoch": 0.6550218340611353,
+      "grad_norm": 0.16424529254436493,
+      "learning_rate": 1.4095150380386427e-05,
+      "loss": 0.14238927364349366,
+      "step": 3600
+    },
+    {
+      "epoch": 0.6559315866084425,
+      "grad_norm": 0.19264160096645355,
+      "learning_rate": 1.402896132106415e-05,
+      "loss": 0.14297477006912232,
+      "step": 3605
+    },
+    {
+      "epoch": 0.6568413391557496,
+      "grad_norm": 0.18319948017597198,
+      "learning_rate": 1.3962867370024347e-05,
+      "loss": 0.1448880434036255,
+      "step": 3610
+    },
+    {
+      "epoch": 0.6577510917030568,
+      "grad_norm": 0.16507290303707123,
+      "learning_rate": 1.389686910023758e-05,
+      "loss": 0.14724698066711425,
+      "step": 3615
+    },
+    {
+      "epoch": 0.6586608442503639,
+      "grad_norm": 0.17871244251728058,
+      "learning_rate": 1.3830967083844942e-05,
+      "loss": 0.14479386806488037,
+      "step": 3620
+    },
+    {
+      "epoch": 0.659570596797671,
+      "grad_norm": 0.1846228390932083,
+      "learning_rate": 1.3765161892153112e-05,
+      "loss": 0.1453616738319397,
+      "step": 3625
+    },
+    {
+      "epoch": 0.6604803493449781,
+      "grad_norm": 0.17185978591442108,
+      "learning_rate": 1.3699454095629372e-05,
+      "loss": 0.14906206130981445,
+      "step": 3630
+    },
+    {
+      "epoch": 0.6613901018922853,
+      "grad_norm": 0.14751191437244415,
+      "learning_rate": 1.3633844263896698e-05,
+      "loss": 0.13991892337799072,
+      "step": 3635
+    },
+    {
+      "epoch": 0.6622998544395924,
+      "grad_norm": 0.22059763967990875,
+      "learning_rate": 1.3568332965728817e-05,
+      "loss": 0.14680869579315187,
+      "step": 3640
+    },
+    {
+      "epoch": 0.6632096069868996,
+      "grad_norm": 0.15295909345149994,
+      "learning_rate": 1.3502920769045232e-05,
+      "loss": 0.1404443383216858,
+      "step": 3645
+    },
+    {
+      "epoch": 0.6641193595342066,
+      "grad_norm": 0.14600558578968048,
+      "learning_rate": 1.3437608240906364e-05,
+      "loss": 0.14663270711898804,
+      "step": 3650
+    },
+    {
+      "epoch": 0.6650291120815138,
+      "grad_norm": 0.15548352897167206,
+      "learning_rate": 1.3372395947508587e-05,
+      "loss": 0.1431443452835083,
+      "step": 3655
+    },
+    {
+      "epoch": 0.665938864628821,
+      "grad_norm": 0.1813388466835022,
+      "learning_rate": 1.3307284454179342e-05,
+      "loss": 0.1458706736564636,
+      "step": 3660
+    },
+    {
+      "epoch": 0.6668486171761281,
+      "grad_norm": 0.16326870024204254,
+      "learning_rate": 1.3242274325372247e-05,
+      "loss": 0.14700595140457154,
+      "step": 3665
+    },
+    {
+      "epoch": 0.6677583697234353,
+      "grad_norm": 0.18779197335243225,
+      "learning_rate": 1.3177366124662149e-05,
+      "loss": 0.1497237801551819,
+      "step": 3670
+    },
+    {
+      "epoch": 0.6686681222707423,
+      "grad_norm": 0.16291002929210663,
+      "learning_rate": 1.3112560414740315e-05,
+      "loss": 0.1387086868286133,
+      "step": 3675
+    },
+    {
+      "epoch": 0.6695778748180495,
+      "grad_norm": 0.1532297134399414,
+      "learning_rate": 1.3047857757409487e-05,
+      "loss": 0.14497545957565308,
+      "step": 3680
+    },
+    {
+      "epoch": 0.6704876273653566,
+      "grad_norm": 0.14697515964508057,
+      "learning_rate": 1.2983258713579066e-05,
+      "loss": 0.1494283437728882,
+      "step": 3685
+    },
+    {
+      "epoch": 0.6713973799126638,
+      "grad_norm": 0.15213452279567719,
+      "learning_rate": 1.2918763843260218e-05,
+      "loss": 0.1468907594680786,
+      "step": 3690
+    },
+    {
+      "epoch": 0.6723071324599709,
+      "grad_norm": 0.1745215803384781,
+      "learning_rate": 1.285437370556099e-05,
+      "loss": 0.14997754096984864,
+      "step": 3695
+    },
+    {
+      "epoch": 0.673216885007278,
+      "grad_norm": 0.19207637012004852,
+      "learning_rate": 1.2790088858681577e-05,
+      "loss": 0.14202862977981567,
+      "step": 3700
+    },
+    {
+      "epoch": 0.6741266375545851,
+      "grad_norm": 0.1521359086036682,
+      "learning_rate": 1.2725909859909313e-05,
+      "loss": 0.14547673463821412,
+      "step": 3705
+    },
+    {
+      "epoch": 0.6750363901018923,
+      "grad_norm": 0.16975535452365875,
+      "learning_rate": 1.2661837265613999e-05,
+      "loss": 0.14006874561309815,
+      "step": 3710
+    },
+    {
+      "epoch": 0.6759461426491994,
+      "grad_norm": 0.22234582901000977,
+      "learning_rate": 1.2597871631242992e-05,
+      "loss": 0.13691173791885375,
+      "step": 3715
+    },
+    {
+      "epoch": 0.6768558951965066,
+      "grad_norm": 0.16082969307899475,
+      "learning_rate": 1.2534013511316383e-05,
+      "loss": 0.14932308197021485,
+      "step": 3720
+    },
+    {
+      "epoch": 0.6777656477438136,
+      "grad_norm": 0.1751091182231903,
+      "learning_rate": 1.247026345942226e-05,
+      "loss": 0.14531974792480468,
+      "step": 3725
+    },
+    {
+      "epoch": 0.6786754002911208,
+      "grad_norm": 0.15838147699832916,
+      "learning_rate": 1.2406622028211844e-05,
+      "loss": 0.14759832620620728,
+      "step": 3730
+    },
+    {
+      "epoch": 0.6795851528384279,
+      "grad_norm": 0.1771744042634964,
+      "learning_rate": 1.2343089769394714e-05,
+      "loss": 0.1382831573486328,
+      "step": 3735
+    },
+    {
+      "epoch": 0.6804949053857351,
+      "grad_norm": 0.16301538050174713,
+      "learning_rate": 1.2279667233734037e-05,
+      "loss": 0.14444775581359864,
+      "step": 3740
+    },
+    {
+      "epoch": 0.6814046579330422,
+      "grad_norm": 0.1584121286869049,
+      "learning_rate": 1.2216354971041796e-05,
+      "loss": 0.14200170040130616,
+      "step": 3745
+    },
+    {
+      "epoch": 0.6823144104803494,
+      "grad_norm": 0.139187291264534,
+      "learning_rate": 1.2153153530174007e-05,
+      "loss": 0.14318310022354125,
+      "step": 3750
+    },
+    {
+      "epoch": 0.6832241630276564,
+      "grad_norm": 0.13665248453617096,
+      "learning_rate": 1.2090063459025955e-05,
+      "loss": 0.1411946654319763,
+      "step": 3755
+    },
+    {
+      "epoch": 0.6841339155749636,
+      "grad_norm": 0.16273781657218933,
+      "learning_rate": 1.2027085304527475e-05,
+      "loss": 0.14873508214950562,
+      "step": 3760
+    },
+    {
+      "epoch": 0.6850436681222707,
+      "grad_norm": 0.16317526996135712,
+      "learning_rate": 1.1964219612638194e-05,
+      "loss": 0.14644203186035157,
+      "step": 3765
+    },
+    {
+      "epoch": 0.6859534206695779,
+      "grad_norm": 0.17253617942333221,
+      "learning_rate": 1.1901466928342777e-05,
+      "loss": 0.14027841091156007,
+      "step": 3770
+    },
+    {
+      "epoch": 0.6868631732168851,
+      "grad_norm": 0.19692830741405487,
+      "learning_rate": 1.183882779564624e-05,
+      "loss": 0.14411110877990724,
+      "step": 3775
+    },
+    {
+      "epoch": 0.6877729257641921,
+      "grad_norm": 0.15444578230381012,
+      "learning_rate": 1.1776302757569214e-05,
+      "loss": 0.14355008602142333,
+      "step": 3780
+    },
+    {
+      "epoch": 0.6886826783114993,
+      "grad_norm": 0.1622200757265091,
+      "learning_rate": 1.1713892356143239e-05,
+      "loss": 0.14794334173202514,
+      "step": 3785
+    },
+    {
+      "epoch": 0.6895924308588064,
+      "grad_norm": 0.1898501068353653,
+      "learning_rate": 1.1651597132406073e-05,
+      "loss": 0.1418622612953186,
+      "step": 3790
+    },
+    {
+      "epoch": 0.6905021834061136,
+      "grad_norm": 0.17803208529949188,
+      "learning_rate": 1.1589417626396973e-05,
+      "loss": 0.14576040506362914,
+      "step": 3795
+    },
+    {
+      "epoch": 0.6914119359534207,
+      "grad_norm": 0.17138013243675232,
+      "learning_rate": 1.1527354377152053e-05,
+      "loss": 0.14494270086288452,
+      "step": 3800
+    },
+    {
+      "epoch": 0.6923216885007278,
+      "grad_norm": 0.15170913934707642,
+      "learning_rate": 1.1465407922699603e-05,
+      "loss": 0.144084370136261,
+      "step": 3805
+    },
+    {
+      "epoch": 0.6932314410480349,
+      "grad_norm": 0.158562570810318,
+      "learning_rate": 1.1403578800055387e-05,
+      "loss": 0.13636608123779298,
+      "step": 3810
+    },
+    {
+      "epoch": 0.6941411935953421,
+      "grad_norm": 0.17687302827835083,
+      "learning_rate": 1.1341867545218044e-05,
+      "loss": 0.14214688539505005,
+      "step": 3815
+    },
+    {
+      "epoch": 0.6950509461426492,
+      "grad_norm": 0.15394899249076843,
+      "learning_rate": 1.1280274693164378e-05,
+      "loss": 0.14914129972457885,
+      "step": 3820
+    },
+    {
+      "epoch": 0.6959606986899564,
+      "grad_norm": 0.15709355473518372,
+      "learning_rate": 1.12188007778448e-05,
+      "loss": 0.14798580408096312,
+      "step": 3825
+    },
+    {
+      "epoch": 0.6968704512372634,
+      "grad_norm": 0.16631539165973663,
+      "learning_rate": 1.115744633217864e-05,
+      "loss": 0.14756966829299928,
+      "step": 3830
+    },
+    {
+      "epoch": 0.6977802037845706,
+      "grad_norm": 0.15893076360225677,
+      "learning_rate": 1.109621188804951e-05,
+      "loss": 0.14061959981918334,
+      "step": 3835
+    },
+    {
+      "epoch": 0.6986899563318777,
+      "grad_norm": 0.183414489030838,
+      "learning_rate": 1.103509797630077e-05,
+      "loss": 0.1448473334312439,
+      "step": 3840
+    },
+    {
+      "epoch": 0.6995997088791849,
+      "grad_norm": 0.14087305963039398,
+      "learning_rate": 1.0974105126730841e-05,
+      "loss": 0.14369285106658936,
+      "step": 3845
+    },
+    {
+      "epoch": 0.700509461426492,
+      "grad_norm": 0.16919967532157898,
+      "learning_rate": 1.0913233868088685e-05,
+      "loss": 0.1478085398674011,
+      "step": 3850
+    },
+    {
+      "epoch": 0.7014192139737991,
+      "grad_norm": 0.1439533829689026,
+      "learning_rate": 1.0852484728069178e-05,
+      "loss": 0.14376721382141114,
+      "step": 3855
+    },
+    {
+      "epoch": 0.7023289665211062,
+      "grad_norm": 0.17719274759292603,
+      "learning_rate": 1.0791858233308521e-05,
+      "loss": 0.14089040756225585,
+      "step": 3860
+    },
+    {
+      "epoch": 0.7032387190684134,
+      "grad_norm": 0.19753769040107727,
+      "learning_rate": 1.0731354909379754e-05,
+      "loss": 0.15021742582321168,
+      "step": 3865
+    },
+    {
+      "epoch": 0.7041484716157205,
+      "grad_norm": 0.19186992943286896,
+      "learning_rate": 1.0670975280788086e-05,
+      "loss": 0.14113202095031738,
+      "step": 3870
+    },
+    {
+      "epoch": 0.7050582241630277,
+      "grad_norm": 0.1709229201078415,
+      "learning_rate": 1.0610719870966443e-05,
+      "loss": 0.1500566840171814,
+      "step": 3875
+    },
+    {
+      "epoch": 0.7059679767103348,
+      "grad_norm": 0.17846204340457916,
+      "learning_rate": 1.0550589202270892e-05,
+      "loss": 0.15014195442199707,
+      "step": 3880
+    },
+    {
+      "epoch": 0.7068777292576419,
+      "grad_norm": 0.1827082335948944,
+      "learning_rate": 1.0490583795976091e-05,
+      "loss": 0.1423472762107849,
+      "step": 3885
+    },
+    {
+      "epoch": 0.7077874818049491,
+      "grad_norm": 0.17418377101421356,
+      "learning_rate": 1.043070417227083e-05,
+      "loss": 0.14668900966644288,
+      "step": 3890
+    },
+    {
+      "epoch": 0.7086972343522562,
+      "grad_norm": 0.17385616898536682,
+      "learning_rate": 1.0370950850253449e-05,
+      "loss": 0.14627279043197633,
+      "step": 3895
+    },
+    {
+      "epoch": 0.7096069868995634,
+      "grad_norm": 0.16486723721027374,
+      "learning_rate": 1.0311324347927404e-05,
+      "loss": 0.14603652954101562,
+      "step": 3900
+    },
+    {
+      "epoch": 0.7105167394468704,
+      "grad_norm": 0.21806862950325012,
+      "learning_rate": 1.0251825182196732e-05,
+      "loss": 0.1488169550895691,
+      "step": 3905
+    },
+    {
+      "epoch": 0.7114264919941776,
+      "grad_norm": 0.19884569942951202,
+      "learning_rate": 1.019245386886159e-05,
+      "loss": 0.14387656450271608,
+      "step": 3910
+    },
+    {
+      "epoch": 0.7123362445414847,
+      "grad_norm": 0.16139011085033417,
+      "learning_rate": 1.0133210922613789e-05,
+      "loss": 0.1483074426651001,
+      "step": 3915
+    },
+    {
+      "epoch": 0.7132459970887919,
+      "grad_norm": 0.17000740766525269,
+      "learning_rate": 1.007409685703229e-05,
+      "loss": 0.14050065279006957,
+      "step": 3920
+    },
+    {
+      "epoch": 0.714155749636099,
+      "grad_norm": 0.17235304415225983,
+      "learning_rate": 1.0015112184578813e-05,
+      "loss": 0.1440442681312561,
+      "step": 3925
+    },
+    {
+      "epoch": 0.7150655021834061,
+      "grad_norm": 0.15737567842006683,
+      "learning_rate": 9.956257416593362e-06,
+      "loss": 0.14960765838623047,
+      "step": 3930
+    },
+    {
+      "epoch": 0.7159752547307132,
+      "grad_norm": 0.15499180555343628,
+      "learning_rate": 9.897533063289773e-06,
+      "loss": 0.14488829374313356,
+      "step": 3935
+    },
+    {
+      "epoch": 0.7168850072780204,
+      "grad_norm": 0.17744216322898865,
+      "learning_rate": 9.838939633751337e-06,
+      "loss": 0.1416949987411499,
+      "step": 3940
+    },
+    {
+      "epoch": 0.7177947598253275,
+      "grad_norm": 0.1597192883491516,
+      "learning_rate": 9.780477635926358e-06,
+      "loss": 0.14275280237197877,
+      "step": 3945
+    },
+    {
+      "epoch": 0.7187045123726347,
+      "grad_norm": 0.17800374329090118,
+      "learning_rate": 9.722147576623743e-06,
+      "loss": 0.14532098770141602,
+      "step": 3950
+    },
+    {
+      "epoch": 0.7196142649199417,
+      "grad_norm": 0.1828162521123886,
+      "learning_rate": 9.66394996150864e-06,
+      "loss": 0.14525585174560546,
+      "step": 3955
+    },
+    {
+      "epoch": 0.7205240174672489,
+      "grad_norm": 0.1800539344549179,
+      "learning_rate": 9.605885295098005e-06,
+      "loss": 0.14235819578170777,
+      "step": 3960
+    },
+    {
+      "epoch": 0.721433770014556,
+      "grad_norm": 0.16556483507156372,
+      "learning_rate": 9.54795408075628e-06,
+      "loss": 0.13965482711791993,
+      "step": 3965
+    },
+    {
+      "epoch": 0.7223435225618632,
+      "grad_norm": 0.1592024862766266,
+      "learning_rate": 9.49015682069101e-06,
+      "loss": 0.14051042795181273,
+      "step": 3970
+    },
+    {
+      "epoch": 0.7232532751091703,
+      "grad_norm": 0.18988847732543945,
+      "learning_rate": 9.43249401594846e-06,
+      "loss": 0.1436900496482849,
+      "step": 3975
+    },
+    {
+      "epoch": 0.7241630276564774,
+      "grad_norm": 0.24433808028697968,
+      "learning_rate": 9.374966166409329e-06,
+      "loss": 0.14883997440338134,
+      "step": 3980
+    },
+    {
+      "epoch": 0.7250727802037845,
+      "grad_norm": 0.15091639757156372,
+      "learning_rate": 9.317573770784352e-06,
+      "loss": 0.14726560115814208,
+      "step": 3985
+    },
+    {
+      "epoch": 0.7259825327510917,
+      "grad_norm": 0.17045573890209198,
+      "learning_rate": 9.260317326610051e-06,
+      "loss": 0.14120506048202514,
+      "step": 3990
+    },
+    {
+      "epoch": 0.7268922852983989,
+      "grad_norm": 0.18847957253456116,
+      "learning_rate": 9.203197330244343e-06,
+      "loss": 0.1377041220664978,
+      "step": 3995
+    },
+    {
+      "epoch": 0.727802037845706,
+      "grad_norm": 0.1516445279121399,
+      "learning_rate": 9.14621427686229e-06,
+      "loss": 0.14043946266174318,
+      "step": 4000
+    },
+    {
+      "epoch": 0.7287117903930131,
+      "grad_norm": 0.18264050781726837,
+      "learning_rate": 9.0893686604518e-06,
+      "loss": 0.14080368280410765,
+      "step": 4005
+    },
+    {
+      "epoch": 0.7296215429403202,
+      "grad_norm": 0.19129371643066406,
+      "learning_rate": 9.032660973809312e-06,
+      "loss": 0.1402561902999878,
+      "step": 4010
+    },
+    {
+      "epoch": 0.7305312954876274,
+      "grad_norm": 0.15762710571289062,
+      "learning_rate": 8.976091708535567e-06,
+      "loss": 0.14421157836914061,
+      "step": 4015
+    },
+    {
+      "epoch": 0.7314410480349345,
+      "grad_norm": 0.17785198986530304,
+      "learning_rate": 8.919661355031331e-06,
+      "loss": 0.14999009370803834,
+      "step": 4020
+    },
+    {
+      "epoch": 0.7323508005822417,
+      "grad_norm": 0.15306031703948975,
+      "learning_rate": 8.8633704024931e-06,
+      "loss": 0.14101698398590087,
+      "step": 4025
+    },
+    {
+      "epoch": 0.7332605531295487,
+      "grad_norm": 0.16481758654117584,
+      "learning_rate": 8.807219338908968e-06,
+      "loss": 0.14170764684677123,
+      "step": 4030
+    },
+    {
+      "epoch": 0.7341703056768559,
+      "grad_norm": 0.14892235398292542,
+      "learning_rate": 8.751208651054257e-06,
+      "loss": 0.15317896604537964,
+      "step": 4035
+    },
+    {
+      "epoch": 0.735080058224163,
+      "grad_norm": 0.1775592565536499,
+      "learning_rate": 8.695338824487409e-06,
+      "loss": 0.1520617723464966,
+      "step": 4040
+    },
+    {
+      "epoch": 0.7359898107714702,
+      "grad_norm": 0.1614258885383606,
+      "learning_rate": 8.639610343545728e-06,
+      "loss": 0.13747400045394897,
+      "step": 4045
+    },
+    {
+      "epoch": 0.7368995633187773,
+      "grad_norm": 0.21415506303310394,
+      "learning_rate": 8.58402369134117e-06,
+      "loss": 0.1432439088821411,
+      "step": 4050
+    },
+    {
+      "epoch": 0.7378093158660844,
+      "grad_norm": 0.1759418249130249,
+      "learning_rate": 8.528579349756205e-06,
+      "loss": 0.141641104221344,
+      "step": 4055
+    },
+    {
+      "epoch": 0.7387190684133915,
+      "grad_norm": 0.16738329827785492,
+      "learning_rate": 8.47327779943957e-06,
+      "loss": 0.14294810295104982,
+      "step": 4060
+    },
+    {
+      "epoch": 0.7396288209606987,
+      "grad_norm": 0.13916844129562378,
+      "learning_rate": 8.41811951980217e-06,
+      "loss": 0.13876968622207642,
+      "step": 4065
+    },
+    {
+      "epoch": 0.7405385735080058,
+      "grad_norm": 0.1828441321849823,
+      "learning_rate": 8.36310498901288e-06,
+      "loss": 0.148428475856781,
+      "step": 4070
+    },
+    {
+      "epoch": 0.741448326055313,
+      "grad_norm": 0.16534076631069183,
+      "learning_rate": 8.308234683994415e-06,
+      "loss": 0.14222711324691772,
+      "step": 4075
+    },
+    {
+      "epoch": 0.74235807860262,
+      "grad_norm": 0.17922644317150116,
+      "learning_rate": 8.253509080419198e-06,
+      "loss": 0.14365782737731933,
+      "step": 4080
+    },
+    {
+      "epoch": 0.7432678311499272,
+      "grad_norm": 0.15061035752296448,
+      "learning_rate": 8.198928652705204e-06,
+      "loss": 0.13571925163269044,
+      "step": 4085
+    },
+    {
+      "epoch": 0.7441775836972343,
+      "grad_norm": 0.18075402081012726,
+      "learning_rate": 8.144493874011908e-06,
+      "loss": 0.14385528564453126,
+      "step": 4090
+    },
+    {
+      "epoch": 0.7450873362445415,
+      "grad_norm": 0.16514739394187927,
+      "learning_rate": 8.090205216236135e-06,
+      "loss": 0.14920626878738402,
+      "step": 4095
+    },
+    {
+      "epoch": 0.7459970887918487,
+      "grad_norm": 0.16453702747821808,
+      "learning_rate": 8.03606315000797e-06,
+      "loss": 0.14704222679138185,
+      "step": 4100
+    },
+    {
+      "epoch": 0.7469068413391557,
+      "grad_norm": 0.16719917953014374,
+      "learning_rate": 7.982068144686707e-06,
+      "loss": 0.14722511768341065,
+      "step": 4105
+    },
+    {
+      "epoch": 0.7478165938864629,
+      "grad_norm": 0.18499110639095306,
+      "learning_rate": 7.92822066835677e-06,
+      "loss": 0.1401848554611206,
+      "step": 4110
+    },
+    {
+      "epoch": 0.74872634643377,
+      "grad_norm": 0.17249563336372375,
+      "learning_rate": 7.87452118782363e-06,
+      "loss": 0.15132423639297485,
+      "step": 4115
+    },
+    {
+      "epoch": 0.7496360989810772,
+      "grad_norm": 0.15049682557582855,
+      "learning_rate": 7.8209701686098e-06,
+      "loss": 0.1341150164604187,
+      "step": 4120
+    },
+    {
+      "epoch": 0.7505458515283843,
+      "grad_norm": 0.16892646253108978,
+      "learning_rate": 7.767568074950751e-06,
+      "loss": 0.1466840147972107,
+      "step": 4125
+    },
+    {
+      "epoch": 0.7514556040756915,
+      "grad_norm": 0.17288286983966827,
+      "learning_rate": 7.714315369790942e-06,
+      "loss": 0.13819680213928223,
+      "step": 4130
+    },
+    {
+      "epoch": 0.7523653566229985,
+      "grad_norm": 0.21893996000289917,
+      "learning_rate": 7.661212514779745e-06,
+      "loss": 0.14369510412216185,
+      "step": 4135
+    },
+    {
+      "epoch": 0.7532751091703057,
+      "grad_norm": 0.1674601435661316,
+      "learning_rate": 7.608259970267509e-06,
+      "loss": 0.14810250997543334,
+      "step": 4140
+    },
+    {
+      "epoch": 0.7541848617176128,
+      "grad_norm": 0.15875539183616638,
+      "learning_rate": 7.555458195301526e-06,
+      "loss": 0.14103198051452637,
+      "step": 4145
+    },
+    {
+      "epoch": 0.75509461426492,
+      "grad_norm": 0.19454079866409302,
+      "learning_rate": 7.502807647622037e-06,
+      "loss": 0.13848764896392823,
+      "step": 4150
+    },
+    {
+      "epoch": 0.756004366812227,
+      "grad_norm": 0.1795455813407898,
+      "learning_rate": 7.450308783658341e-06,
+      "loss": 0.14459335803985596,
+      "step": 4155
+    },
+    {
+      "epoch": 0.7569141193595342,
+      "grad_norm": 0.1643362045288086,
+      "learning_rate": 7.397962058524735e-06,
+      "loss": 0.14335378408432006,
+      "step": 4160
+    },
+    {
+      "epoch": 0.7578238719068413,
+      "grad_norm": 0.16362066566944122,
+      "learning_rate": 7.3457679260166475e-06,
+      "loss": 0.14222005605697632,
+      "step": 4165
+    },
+    {
+      "epoch": 0.7587336244541485,
+      "grad_norm": 0.17313003540039062,
+      "learning_rate": 7.293726838606674e-06,
+      "loss": 0.14272255897521974,
+      "step": 4170
+    },
+    {
+      "epoch": 0.7596433770014556,
+      "grad_norm": 0.1809929460287094,
+      "learning_rate": 7.2418392474406405e-06,
+      "loss": 0.14089123010635377,
+      "step": 4175
+    },
+    {
+      "epoch": 0.7605531295487628,
+      "grad_norm": 0.14306005835533142,
+      "learning_rate": 7.19010560233373e-06,
+      "loss": 0.13531534671783446,
+      "step": 4180
+    },
+    {
+      "epoch": 0.7614628820960698,
+      "grad_norm": 0.15525390207767487,
+      "learning_rate": 7.138526351766559e-06,
+      "loss": 0.14340845346450806,
+      "step": 4185
+    },
+    {
+      "epoch": 0.762372634643377,
+      "grad_norm": 0.24478943645954132,
+      "learning_rate": 7.087101942881263e-06,
+      "loss": 0.14744555950164795,
+      "step": 4190
+    },
+    {
+      "epoch": 0.7632823871906841,
+      "grad_norm": 0.31335577368736267,
+      "learning_rate": 7.035832821477711e-06,
+      "loss": 0.1484094500541687,
+      "step": 4195
+    },
+    {
+      "epoch": 0.7641921397379913,
+      "grad_norm": 0.15140366554260254,
+      "learning_rate": 6.984719432009515e-06,
+      "loss": 0.14991614818572999,
+      "step": 4200
+    },
+    {
+      "epoch": 0.7651018922852983,
+      "grad_norm": 0.16125506162643433,
+      "learning_rate": 6.933762217580289e-06,
+      "loss": 0.1408134937286377,
+      "step": 4205
+    },
+    {
+      "epoch": 0.7660116448326055,
+      "grad_norm": 0.2501450181007385,
+      "learning_rate": 6.882961619939726e-06,
+      "loss": 0.13875640630722047,
+      "step": 4210
+    },
+    {
+      "epoch": 0.7669213973799127,
+      "grad_norm": 0.16227811574935913,
+      "learning_rate": 6.8323180794798245e-06,
+      "loss": 0.14138660430908204,
+      "step": 4215
+    },
+    {
+      "epoch": 0.7678311499272198,
+      "grad_norm": 0.16676810383796692,
+      "learning_rate": 6.781832035231053e-06,
+      "loss": 0.14696706533432008,
+      "step": 4220
+    },
+    {
+      "epoch": 0.768740902474527,
+      "grad_norm": 0.14638574421405792,
+      "learning_rate": 6.731503924858518e-06,
+      "loss": 0.14263020753860473,
+      "step": 4225
+    },
+    {
+      "epoch": 0.769650655021834,
+      "grad_norm": 0.17093190550804138,
+      "learning_rate": 6.681334184658211e-06,
+      "loss": 0.14694111347198485,
+      "step": 4230
+    },
+    {
+      "epoch": 0.7705604075691412,
+      "grad_norm": 0.17174287140369415,
+      "learning_rate": 6.631323249553201e-06,
+      "loss": 0.13854929208755493,
+      "step": 4235
+    },
+    {
+      "epoch": 0.7714701601164483,
+      "grad_norm": 0.14599016308784485,
+      "learning_rate": 6.5814715530898745e-06,
+      "loss": 0.14058833122253417,
+      "step": 4240
+    },
+    {
+      "epoch": 0.7723799126637555,
+      "grad_norm": 0.16222265362739563,
+      "learning_rate": 6.531779527434176e-06,
+      "loss": 0.1428326725959778,
+      "step": 4245
+    },
+    {
+      "epoch": 0.7732896652110626,
+      "grad_norm": 0.1741994023323059,
+      "learning_rate": 6.482247603367839e-06,
+      "loss": 0.13985042572021483,
+      "step": 4250
+    },
+    {
+      "epoch": 0.7741994177583698,
+      "grad_norm": 0.17427101731300354,
+      "learning_rate": 6.432876210284688e-06,
+      "loss": 0.1442667603492737,
+      "step": 4255
+    },
+    {
+      "epoch": 0.7751091703056768,
+      "grad_norm": 0.1665259599685669,
+      "learning_rate": 6.383665776186912e-06,
+      "loss": 0.1421986222267151,
+      "step": 4260
+    },
+    {
+      "epoch": 0.776018922852984,
+      "grad_norm": 0.1728232353925705,
+      "learning_rate": 6.334616727681303e-06,
+      "loss": 0.1367053508758545,
+      "step": 4265
+    },
+    {
+      "epoch": 0.7769286754002911,
+      "grad_norm": 0.15882381796836853,
+      "learning_rate": 6.285729489975639e-06,
+      "loss": 0.14551182985305786,
+      "step": 4270
+    },
+    {
+      "epoch": 0.7778384279475983,
+      "grad_norm": 0.242042675614357,
+      "learning_rate": 6.2370044868749115e-06,
+      "loss": 0.1455132007598877,
+      "step": 4275
+    },
+    {
+      "epoch": 0.7787481804949054,
+      "grad_norm": 0.1599501073360443,
+      "learning_rate": 6.188442140777742e-06,
+      "loss": 0.1424942970275879,
+      "step": 4280
+    },
+    {
+      "epoch": 0.7796579330422125,
+      "grad_norm": 0.15182635188102722,
+      "learning_rate": 6.140042872672647e-06,
+      "loss": 0.14212887287139891,
+      "step": 4285
+    },
+    {
+      "epoch": 0.7805676855895196,
+      "grad_norm": 0.1720375418663025,
+      "learning_rate": 6.091807102134403e-06,
+      "loss": 0.14243412017822266,
+      "step": 4290
+    },
+    {
+      "epoch": 0.7814774381368268,
+      "grad_norm": 0.16436047852039337,
+      "learning_rate": 6.043735247320454e-06,
+      "loss": 0.15035657882690429,
+      "step": 4295
+    },
+    {
+      "epoch": 0.7823871906841339,
+      "grad_norm": 0.1498408019542694,
+      "learning_rate": 5.995827724967218e-06,
+      "loss": 0.14494839906692505,
+      "step": 4300
+    },
+    {
+      "epoch": 0.7832969432314411,
+      "grad_norm": 0.16924560070037842,
+      "learning_rate": 5.948084950386535e-06,
+      "loss": 0.13581212759017944,
+      "step": 4305
+    },
+    {
+      "epoch": 0.7842066957787481,
+      "grad_norm": 0.15889139473438263,
+      "learning_rate": 5.900507337462036e-06,
+      "loss": 0.15071530342102052,
+      "step": 4310
+    },
+    {
+      "epoch": 0.7851164483260553,
+      "grad_norm": 0.17201054096221924,
+      "learning_rate": 5.853095298645542e-06,
+      "loss": 0.1398628830909729,
+      "step": 4315
+    },
+    {
+      "epoch": 0.7860262008733624,
+      "grad_norm": 0.17965619266033173,
+      "learning_rate": 5.805849244953548e-06,
+      "loss": 0.14666696786880493,
+      "step": 4320
+    },
+    {
+      "epoch": 0.7869359534206696,
+      "grad_norm": 0.17514032125473022,
+      "learning_rate": 5.758769585963569e-06,
+      "loss": 0.1383386731147766,
+      "step": 4325
+    },
+    {
+      "epoch": 0.7878457059679768,
+      "grad_norm": 0.17497631907463074,
+      "learning_rate": 5.7118567298106744e-06,
+      "loss": 0.14362354278564454,
+      "step": 4330
+    },
+    {
+      "epoch": 0.7887554585152838,
+      "grad_norm": 0.16770458221435547,
+      "learning_rate": 5.665111083183905e-06,
+      "loss": 0.14136618375778198,
+      "step": 4335
+    },
+    {
+      "epoch": 0.789665211062591,
+      "grad_norm": 0.17134106159210205,
+      "learning_rate": 5.618533051322747e-06,
+      "loss": 0.1401529550552368,
+      "step": 4340
+    },
+    {
+      "epoch": 0.7905749636098981,
+      "grad_norm": 0.19458788633346558,
+      "learning_rate": 5.5721230380136435e-06,
+      "loss": 0.1393273115158081,
+      "step": 4345
+    },
+    {
+      "epoch": 0.7914847161572053,
+      "grad_norm": 0.19483692944049835,
+      "learning_rate": 5.525881445586467e-06,
+      "loss": 0.1369825482368469,
+      "step": 4350
+    },
+    {
+      "epoch": 0.7923944687045124,
+      "grad_norm": 0.3052191734313965,
+      "learning_rate": 5.4798086749110495e-06,
+      "loss": 0.14762181043624878,
+      "step": 4355
+    },
+    {
+      "epoch": 0.7933042212518195,
+      "grad_norm": 0.164458766579628,
+      "learning_rate": 5.4339051253937065e-06,
+      "loss": 0.14501686096191407,
+      "step": 4360
+    },
+    {
+      "epoch": 0.7942139737991266,
+      "grad_norm": 0.1719193458557129,
+      "learning_rate": 5.3881711949737625e-06,
+      "loss": 0.13321092128753662,
+      "step": 4365
+    },
+    {
+      "epoch": 0.7951237263464338,
+      "grad_norm": 0.17219696938991547,
+      "learning_rate": 5.342607280120121e-06,
+      "loss": 0.1413906455039978,
+      "step": 4370
+    },
+    {
+      "epoch": 0.7960334788937409,
+      "grad_norm": 0.15083056688308716,
+      "learning_rate": 5.297213775827789e-06,
+      "loss": 0.14772192239761353,
+      "step": 4375
+    },
+    {
+      "epoch": 0.7969432314410481,
+      "grad_norm": 0.1699071079492569,
+      "learning_rate": 5.251991075614507e-06,
+      "loss": 0.1392375946044922,
+      "step": 4380
+    },
+    {
+      "epoch": 0.7978529839883551,
+      "grad_norm": 0.1680395007133484,
+      "learning_rate": 5.206939571517302e-06,
+      "loss": 0.14185575246810914,
+      "step": 4385
+    },
+    {
+      "epoch": 0.7987627365356623,
+      "grad_norm": 0.16526710987091064,
+      "learning_rate": 5.162059654089083e-06,
+      "loss": 0.15001428127288818,
+      "step": 4390
+    },
+    {
+      "epoch": 0.7996724890829694,
+      "grad_norm": 0.16281752288341522,
+      "learning_rate": 5.1173517123952794e-06,
+      "loss": 0.13747023344039916,
+      "step": 4395
+    },
+    {
+      "epoch": 0.8005822416302766,
+      "grad_norm": 0.1454378366470337,
+      "learning_rate": 5.072816134010458e-06,
+      "loss": 0.14710829257965088,
+      "step": 4400
+    },
+    {
+      "epoch": 0.8014919941775837,
+      "grad_norm": 0.16565890610218048,
+      "learning_rate": 5.028453305014966e-06,
+      "loss": 0.14138611555099487,
+      "step": 4405
+    },
+    {
+      "epoch": 0.8024017467248908,
+      "grad_norm": 0.1962810605764389,
+      "learning_rate": 4.984263609991577e-06,
+      "loss": 0.13836177587509155,
+      "step": 4410
+    },
+    {
+      "epoch": 0.8033114992721979,
+      "grad_norm": 0.16091369092464447,
+      "learning_rate": 4.940247432022149e-06,
+      "loss": 0.14407440423965454,
+      "step": 4415
+    },
+    {
+      "epoch": 0.8042212518195051,
+      "grad_norm": 0.1930241584777832,
+      "learning_rate": 4.89640515268433e-06,
+      "loss": 0.14346336126327514,
+      "step": 4420
+    },
+    {
+      "epoch": 0.8051310043668122,
+      "grad_norm": 0.19301500916481018,
+      "learning_rate": 4.852737152048242e-06,
+      "loss": 0.14174317121505736,
+      "step": 4425
+    },
+    {
+      "epoch": 0.8060407569141194,
+      "grad_norm": 0.1541353315114975,
+      "learning_rate": 4.80924380867315e-06,
+      "loss": 0.14100592136383056,
+      "step": 4430
+    },
+    {
+      "epoch": 0.8069505094614265,
+      "grad_norm": 0.16285750269889832,
+      "learning_rate": 4.765925499604243e-06,
+      "loss": 0.1441288709640503,
+      "step": 4435
+    },
+    {
+      "epoch": 0.8078602620087336,
+      "grad_norm": 0.17382675409317017,
+      "learning_rate": 4.722782600369299e-06,
+      "loss": 0.13763951063156127,
+      "step": 4440
+    },
+    {
+      "epoch": 0.8087700145560408,
+      "grad_norm": 0.1697344034910202,
+      "learning_rate": 4.679815484975505e-06,
+      "loss": 0.1410105347633362,
+      "step": 4445
+    },
+    {
+      "epoch": 0.8096797671033479,
+      "grad_norm": 0.19964542984962463,
+      "learning_rate": 4.637024525906131e-06,
+      "loss": 0.1439276695251465,
+      "step": 4450
+    },
+    {
+      "epoch": 0.8105895196506551,
+      "grad_norm": 0.165307879447937,
+      "learning_rate": 4.59441009411736e-06,
+      "loss": 0.13897504806518554,
+      "step": 4455
+    },
+    {
+      "epoch": 0.8114992721979621,
+      "grad_norm": 0.16687989234924316,
+      "learning_rate": 4.551972559035067e-06,
+      "loss": 0.1422593355178833,
+      "step": 4460
+    },
+    {
+      "epoch": 0.8124090247452693,
+      "grad_norm": 0.15737789869308472,
+      "learning_rate": 4.509712288551571e-06,
+      "loss": 0.1452128052711487,
+      "step": 4465
+    },
+    {
+      "epoch": 0.8133187772925764,
+      "grad_norm": 0.17116659879684448,
+      "learning_rate": 4.467629649022509e-06,
+      "loss": 0.14385371208190917,
+      "step": 4470
+    },
+    {
+      "epoch": 0.8142285298398836,
+      "grad_norm": 0.17457640171051025,
+      "learning_rate": 4.425725005263623e-06,
+      "loss": 0.14808475971221924,
+      "step": 4475
+    },
+    {
+      "epoch": 0.8151382823871907,
+      "grad_norm": 0.1621970385313034,
+      "learning_rate": 4.383998720547583e-06,
+      "loss": 0.13927959203720092,
+      "step": 4480
+    },
+    {
+      "epoch": 0.8160480349344978,
+      "grad_norm": 0.176296666264534,
+      "learning_rate": 4.342451156600896e-06,
+      "loss": 0.15041060447692872,
+      "step": 4485
+    },
+    {
+      "epoch": 0.8169577874818049,
+      "grad_norm": 0.17157645523548126,
+      "learning_rate": 4.301082673600698e-06,
+      "loss": 0.13932652473449708,
+      "step": 4490
+    },
+    {
+      "epoch": 0.8178675400291121,
+      "grad_norm": 0.15378527343273163,
+      "learning_rate": 4.259893630171682e-06,
+      "loss": 0.1406856894493103,
+      "step": 4495
+    },
+    {
+      "epoch": 0.8187772925764192,
+      "grad_norm": 0.1750226765871048,
+      "learning_rate": 4.218884383382987e-06,
+      "loss": 0.1350164532661438,
+      "step": 4500
+    },
+    {
+      "epoch": 0.8196870451237264,
+      "grad_norm": 0.1393742561340332,
+      "learning_rate": 4.178055288745053e-06,
+      "loss": 0.13769235610961914,
+      "step": 4505
+    },
+    {
+      "epoch": 0.8205967976710334,
+      "grad_norm": 0.1668994128704071,
+      "learning_rate": 4.137406700206617e-06,
+      "loss": 0.14029752016067504,
+      "step": 4510
+    },
+    {
+      "epoch": 0.8215065502183406,
+      "grad_norm": 0.1833454668521881,
+      "learning_rate": 4.0969389701515675e-06,
+      "loss": 0.14276301860809326,
+      "step": 4515
+    },
+    {
+      "epoch": 0.8224163027656477,
+      "grad_norm": 0.16187874972820282,
+      "learning_rate": 4.056652449395945e-06,
+      "loss": 0.1444832682609558,
+      "step": 4520
+    },
+    {
+      "epoch": 0.8233260553129549,
+      "grad_norm": 0.1453280746936798,
+      "learning_rate": 4.01654748718488e-06,
+      "loss": 0.14512733221054078,
+      "step": 4525
+    },
+    {
+      "epoch": 0.824235807860262,
+      "grad_norm": 0.1782725751399994,
+      "learning_rate": 3.976624431189563e-06,
+      "loss": 0.14093561172485353,
+      "step": 4530
+    },
+    {
+      "epoch": 0.8251455604075691,
+      "grad_norm": 0.17374491691589355,
+      "learning_rate": 3.936883627504234e-06,
+      "loss": 0.14031401872634888,
+      "step": 4535
+    },
+    {
+      "epoch": 0.8260553129548762,
+      "grad_norm": 0.1609172821044922,
+      "learning_rate": 3.897325420643174e-06,
+      "loss": 0.1428336262702942,
+      "step": 4540
+    },
+    {
+      "epoch": 0.8269650655021834,
+      "grad_norm": 0.1520884931087494,
+      "learning_rate": 3.85795015353774e-06,
+      "loss": 0.1460547924041748,
+      "step": 4545
+    },
+    {
+      "epoch": 0.8278748180494906,
+      "grad_norm": 0.20986326038837433,
+      "learning_rate": 3.818758167533376e-06,
+      "loss": 0.14706350564956666,
+      "step": 4550
+    },
+    {
+      "epoch": 0.8287845705967977,
+      "grad_norm": 0.16825413703918457,
+      "learning_rate": 3.7797498023866396e-06,
+      "loss": 0.14507200717926025,
+      "step": 4555
+    },
+    {
+      "epoch": 0.8296943231441049,
+      "grad_norm": 0.16758380830287933,
+      "learning_rate": 3.740925396262296e-06,
+      "loss": 0.14898381233215333,
+      "step": 4560
+    },
+    {
+      "epoch": 0.8306040756914119,
+      "grad_norm": 0.15207453072071075,
+      "learning_rate": 3.7022852857303503e-06,
+      "loss": 0.14138854742050172,
+      "step": 4565
+    },
+    {
+      "epoch": 0.8315138282387191,
+      "grad_norm": 0.15150749683380127,
+      "learning_rate": 3.66382980576315e-06,
+      "loss": 0.13894975185394287,
+      "step": 4570
+    },
+    {
+      "epoch": 0.8324235807860262,
+      "grad_norm": 0.17071188986301422,
+      "learning_rate": 3.625559289732472e-06,
+      "loss": 0.14072470664978026,
+      "step": 4575
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.154335618019104,
+      "learning_rate": 3.5874740694066294e-06,
+      "loss": 0.13791344165802003,
+      "step": 4580
+    },
+    {
+      "epoch": 0.8342430858806404,
+      "grad_norm": 0.14017128944396973,
+      "learning_rate": 3.5495744749476116e-06,
+      "loss": 0.14427922964096068,
+      "step": 4585
+    },
+    {
+      "epoch": 0.8351528384279476,
+      "grad_norm": 0.17210033535957336,
+      "learning_rate": 3.5118608349081983e-06,
+      "loss": 0.15191166400909423,
+      "step": 4590
+    },
+    {
+      "epoch": 0.8360625909752547,
+      "grad_norm": 0.18715685606002808,
+      "learning_rate": 3.4743334762291358e-06,
+      "loss": 0.14451316595077515,
+      "step": 4595
+    },
+    {
+      "epoch": 0.8369723435225619,
+      "grad_norm": 0.18079884350299835,
+      "learning_rate": 3.436992724236293e-06,
+      "loss": 0.13530746698379517,
+      "step": 4600
+    },
+    {
+      "epoch": 0.837882096069869,
+      "grad_norm": 0.13519920408725739,
+      "learning_rate": 3.399838902637817e-06,
+      "loss": 0.1477964401245117,
+      "step": 4605
+    },
+    {
+      "epoch": 0.8387918486171762,
+      "grad_norm": 0.1778026670217514,
+      "learning_rate": 3.3628723335213885e-06,
+      "loss": 0.14419831037521363,
+      "step": 4610
+    },
+    {
+      "epoch": 0.8397016011644832,
+      "grad_norm": 0.15165366232395172,
+      "learning_rate": 3.326093337351355e-06,
+      "loss": 0.13888469934463502,
+      "step": 4615
+    },
+    {
+      "epoch": 0.8406113537117904,
+      "grad_norm": 0.17049473524093628,
+      "learning_rate": 3.2895022329660018e-06,
+      "loss": 0.14438477754592896,
+      "step": 4620
+    },
+    {
+      "epoch": 0.8415211062590975,
+      "grad_norm": 0.16536414623260498,
+      "learning_rate": 3.2530993375747833e-06,
+      "loss": 0.1444351315498352,
+      "step": 4625
+    },
+    {
+      "epoch": 0.8424308588064047,
+      "grad_norm": 0.17570015788078308,
+      "learning_rate": 3.2168849667555402e-06,
+      "loss": 0.13861945867538453,
+      "step": 4630
+    },
+    {
+      "epoch": 0.8433406113537117,
+      "grad_norm": 0.1699545532464981,
+      "learning_rate": 3.1808594344518132e-06,
+      "loss": 0.13902754783630372,
+      "step": 4635
+    },
+    {
+      "epoch": 0.8442503639010189,
+      "grad_norm": 0.12331254780292511,
+      "learning_rate": 3.1450230529700837e-06,
+      "loss": 0.14104254245758058,
+      "step": 4640
+    },
+    {
+      "epoch": 0.845160116448326,
+      "grad_norm": 0.1508190929889679,
+      "learning_rate": 3.1093761329770708e-06,
+      "loss": 0.13288766145706177,
+      "step": 4645
+    },
+    {
+      "epoch": 0.8460698689956332,
+      "grad_norm": 0.19049489498138428,
+      "learning_rate": 3.0739189834970735e-06,
+      "loss": 0.14914840459823608,
+      "step": 4650
+    },
+    {
+      "epoch": 0.8469796215429404,
+      "grad_norm": 0.1662369966506958,
+      "learning_rate": 3.0386519119092293e-06,
+      "loss": 0.14222898483276367,
+      "step": 4655
+    },
+    {
+      "epoch": 0.8478893740902474,
+      "grad_norm": 0.18985967338085175,
+      "learning_rate": 3.0035752239449126e-06,
+      "loss": 0.14431113004684448,
+      "step": 4660
+    },
+    {
+      "epoch": 0.8487991266375546,
+      "grad_norm": 0.17005261778831482,
+      "learning_rate": 2.9686892236850337e-06,
+      "loss": 0.14140807390213012,
+      "step": 4665
+    },
+    {
+      "epoch": 0.8497088791848617,
+      "grad_norm": 0.16786684095859528,
+      "learning_rate": 2.9339942135574394e-06,
+      "loss": 0.14161460399627684,
+      "step": 4670
+    },
+    {
+      "epoch": 0.8506186317321689,
+      "grad_norm": 0.16358181834220886,
+      "learning_rate": 2.899490494334281e-06,
+      "loss": 0.14674670696258546,
+      "step": 4675
+    },
+    {
+      "epoch": 0.851528384279476,
+      "grad_norm": 0.1651349812746048,
+      "learning_rate": 2.8651783651293867e-06,
+      "loss": 0.13794611692428588,
+      "step": 4680
+    },
+    {
+      "epoch": 0.8524381368267832,
+      "grad_norm": 0.16934923827648163,
+      "learning_rate": 2.831058123395694e-06,
+      "loss": 0.13199397325515747,
+      "step": 4685
+    },
+    {
+      "epoch": 0.8533478893740902,
+      "grad_norm": 0.1704150140285492,
+      "learning_rate": 2.797130064922665e-06,
+      "loss": 0.14044904708862305,
+      "step": 4690
+    },
+    {
+      "epoch": 0.8542576419213974,
+      "grad_norm": 0.1814192682504654,
+      "learning_rate": 2.7633944838337143e-06,
+      "loss": 0.1465100646018982,
+      "step": 4695
+    },
+    {
+      "epoch": 0.8551673944687045,
+      "grad_norm": 0.18942610919475555,
+      "learning_rate": 2.729851672583669e-06,
+      "loss": 0.14685982465744019,
+      "step": 4700
+    },
+    {
+      "epoch": 0.8560771470160117,
+      "grad_norm": 0.17895208299160004,
+      "learning_rate": 2.6965019219562155e-06,
+      "loss": 0.13971571922302245,
+      "step": 4705
+    },
+    {
+      "epoch": 0.8569868995633187,
+      "grad_norm": 0.22735828161239624,
+      "learning_rate": 2.6633455210614055e-06,
+      "loss": 0.13776102066040039,
+      "step": 4710
+    },
+    {
+      "epoch": 0.8578966521106259,
+      "grad_norm": 0.16779793798923492,
+      "learning_rate": 2.630382757333133e-06,
+      "loss": 0.14134042263031005,
+      "step": 4715
+    },
+    {
+      "epoch": 0.858806404657933,
+      "grad_norm": 0.2148888260126114,
+      "learning_rate": 2.597613916526637e-06,
+      "loss": 0.14680721759796142,
+      "step": 4720
+    },
+    {
+      "epoch": 0.8597161572052402,
+      "grad_norm": 0.16560257971286774,
+      "learning_rate": 2.565039282716045e-06,
+      "loss": 0.14137234687805175,
+      "step": 4725
+    },
+    {
+      "epoch": 0.8606259097525473,
+      "grad_norm": 0.16197068989276886,
+      "learning_rate": 2.532659138291879e-06,
+      "loss": 0.14969314336776735,
+      "step": 4730
+    },
+    {
+      "epoch": 0.8615356622998545,
+      "grad_norm": 0.14650246500968933,
+      "learning_rate": 2.5004737639586497e-06,
+      "loss": 0.13532910346984864,
+      "step": 4735
+    },
+    {
+      "epoch": 0.8624454148471615,
+      "grad_norm": 0.1565634310245514,
+      "learning_rate": 2.4684834387323943e-06,
+      "loss": 0.14146244525909424,
+      "step": 4740
+    },
+    {
+      "epoch": 0.8633551673944687,
+      "grad_norm": 0.18060864508152008,
+      "learning_rate": 2.4366884399382393e-06,
+      "loss": 0.14218534231185914,
+      "step": 4745
+    },
+    {
+      "epoch": 0.8642649199417758,
+      "grad_norm": 0.24613255262374878,
+      "learning_rate": 2.4050890432080557e-06,
+      "loss": 0.13907679319381713,
+      "step": 4750
+    },
+    {
+      "epoch": 0.865174672489083,
+      "grad_norm": 0.16036023199558258,
+      "learning_rate": 2.3736855224780057e-06,
+      "loss": 0.13718113899230958,
+      "step": 4755
+    },
+    {
+      "epoch": 0.86608442503639,
+      "grad_norm": 0.16678516566753387,
+      "learning_rate": 2.3424781499862075e-06,
+      "loss": 0.1327962040901184,
+      "step": 4760
+    },
+    {
+      "epoch": 0.8669941775836972,
+      "grad_norm": 0.1763770878314972,
+      "learning_rate": 2.3114671962703727e-06,
+      "loss": 0.14390318393707274,
+      "step": 4765
+    },
+    {
+      "epoch": 0.8679039301310044,
+      "grad_norm": 0.17735697329044342,
+      "learning_rate": 2.280652930165428e-06,
+      "loss": 0.15223288536071777,
+      "step": 4770
+    },
+    {
+      "epoch": 0.8688136826783115,
+      "grad_norm": 0.15827041864395142,
+      "learning_rate": 2.250035618801241e-06,
+      "loss": 0.14296332597732545,
+      "step": 4775
+    },
+    {
+      "epoch": 0.8697234352256187,
+      "grad_norm": 0.16876135766506195,
+      "learning_rate": 2.219615527600244e-06,
+      "loss": 0.1359076738357544,
+      "step": 4780
+    },
+    {
+      "epoch": 0.8706331877729258,
+      "grad_norm": 0.1800110638141632,
+      "learning_rate": 2.189392920275174e-06,
+      "loss": 0.1424281358718872,
+      "step": 4785
+    },
+    {
+      "epoch": 0.8715429403202329,
+      "grad_norm": 0.1409560889005661,
+      "learning_rate": 2.159368058826783e-06,
+      "loss": 0.14480490684509278,
+      "step": 4790
+    },
+    {
+      "epoch": 0.87245269286754,
+      "grad_norm": 0.1634288728237152,
+      "learning_rate": 2.129541203541535e-06,
+      "loss": 0.14513269662857056,
+      "step": 4795
+    },
+    {
+      "epoch": 0.8733624454148472,
+      "grad_norm": 0.17126062512397766,
+      "learning_rate": 2.099912612989391e-06,
+      "loss": 0.13546934127807617,
+      "step": 4800
+    },
+    {
+      "epoch": 0.8742721979621543,
+      "grad_norm": 0.16704080998897552,
+      "learning_rate": 2.0704825440215457e-06,
+      "loss": 0.13852492570877076,
+      "step": 4805
+    },
+    {
+      "epoch": 0.8751819505094615,
+      "grad_norm": 0.1725970208644867,
+      "learning_rate": 2.0412512517681946e-06,
+      "loss": 0.14504197835922242,
+      "step": 4810
+    },
+    {
+      "epoch": 0.8760917030567685,
+      "grad_norm": 0.1700201779603958,
+      "learning_rate": 2.0122189896363387e-06,
+      "loss": 0.14312338829040527,
+      "step": 4815
+    },
+    {
+      "epoch": 0.8770014556040757,
+      "grad_norm": 0.16491736471652985,
+      "learning_rate": 1.9833860093075834e-06,
+      "loss": 0.14062976837158203,
+      "step": 4820
+    },
+    {
+      "epoch": 0.8779112081513828,
+      "grad_norm": 0.13748787343502045,
+      "learning_rate": 1.9547525607359537e-06,
+      "loss": 0.1346171498298645,
+      "step": 4825
+    },
+    {
+      "epoch": 0.87882096069869,
+      "grad_norm": 0.16399399936199188,
+      "learning_rate": 1.926318892145712e-06,
+      "loss": 0.14178123474121093,
+      "step": 4830
+    },
+    {
+      "epoch": 0.879730713245997,
+      "grad_norm": 0.14491963386535645,
+      "learning_rate": 1.8980852500292412e-06,
+      "loss": 0.1408564567565918,
+      "step": 4835
+    },
+    {
+      "epoch": 0.8806404657933042,
+      "grad_norm": 0.17335423827171326,
+      "learning_rate": 1.8700518791448851e-06,
+      "loss": 0.14403265714645386,
+      "step": 4840
+    },
+    {
+      "epoch": 0.8815502183406113,
+      "grad_norm": 0.17399625480175018,
+      "learning_rate": 1.8422190225148155e-06,
+      "loss": 0.14289036989212037,
+      "step": 4845
+    },
+    {
+      "epoch": 0.8824599708879185,
+      "grad_norm": 0.17945612967014313,
+      "learning_rate": 1.814586921422956e-06,
+      "loss": 0.14494109153747559,
+      "step": 4850
+    },
+    {
+      "epoch": 0.8833697234352256,
+      "grad_norm": 0.1910620480775833,
+      "learning_rate": 1.7871558154128664e-06,
+      "loss": 0.13726245164871215,
+      "step": 4855
+    },
+    {
+      "epoch": 0.8842794759825328,
+      "grad_norm": 0.1771879345178604,
+      "learning_rate": 1.7599259422856756e-06,
+      "loss": 0.1464752197265625,
+      "step": 4860
+    },
+    {
+      "epoch": 0.8851892285298398,
+      "grad_norm": 0.19427461922168732,
+      "learning_rate": 1.7328975380980218e-06,
+      "loss": 0.13823356628417968,
+      "step": 4865
+    },
+    {
+      "epoch": 0.886098981077147,
+      "grad_norm": 0.1491149365901947,
+      "learning_rate": 1.7060708371599897e-06,
+      "loss": 0.1338604211807251,
+      "step": 4870
+    },
+    {
+      "epoch": 0.8870087336244541,
+      "grad_norm": 0.16087733209133148,
+      "learning_rate": 1.6794460720331057e-06,
+      "loss": 0.14184389114379883,
+      "step": 4875
+    },
+    {
+      "epoch": 0.8879184861717613,
+      "grad_norm": 0.14506325125694275,
+      "learning_rate": 1.653023473528309e-06,
+      "loss": 0.14267687797546386,
+      "step": 4880
+    },
+    {
+      "epoch": 0.8888282387190685,
+      "grad_norm": 0.16886365413665771,
+      "learning_rate": 1.626803270703936e-06,
+      "loss": 0.14266083240509034,
+      "step": 4885
+    },
+    {
+      "epoch": 0.8897379912663755,
+      "grad_norm": 0.1891999989748001,
+      "learning_rate": 1.6007856908637652e-06,
+      "loss": 0.1398016929626465,
+      "step": 4890
+    },
+    {
+      "epoch": 0.8906477438136827,
+      "grad_norm": 0.17645299434661865,
+      "learning_rate": 1.5749709595550083e-06,
+      "loss": 0.13869571685791016,
+      "step": 4895
+    },
+    {
+      "epoch": 0.8915574963609898,
+      "grad_norm": 0.17714262008666992,
+      "learning_rate": 1.549359300566408e-06,
+      "loss": 0.14957486391067504,
+      "step": 4900
+    },
+    {
+      "epoch": 0.892467248908297,
+      "grad_norm": 0.18025240302085876,
+      "learning_rate": 1.5239509359262355e-06,
+      "loss": 0.1358652949333191,
+      "step": 4905
+    },
+    {
+      "epoch": 0.8933770014556041,
+      "grad_norm": 0.17539937794208527,
+      "learning_rate": 1.4987460859004154e-06,
+      "loss": 0.13833394050598144,
+      "step": 4910
+    },
+    {
+      "epoch": 0.8942867540029112,
+      "grad_norm": 0.1772230565547943,
+      "learning_rate": 1.4737449689905953e-06,
+      "loss": 0.14202116727828978,
+      "step": 4915
+    },
+    {
+      "epoch": 0.8951965065502183,
+      "grad_norm": 0.1670161783695221,
+      "learning_rate": 1.4489478019322433e-06,
+      "loss": 0.1403665542602539,
+      "step": 4920
+    },
+    {
+      "epoch": 0.8961062590975255,
+      "grad_norm": 0.1697034239768982,
+      "learning_rate": 1.4243547996927926e-06,
+      "loss": 0.1401481032371521,
+      "step": 4925
+    },
+    {
+      "epoch": 0.8970160116448326,
+      "grad_norm": 0.16474860906600952,
+      "learning_rate": 1.3999661754697636e-06,
+      "loss": 0.13969850540161133,
+      "step": 4930
+    },
+    {
+      "epoch": 0.8979257641921398,
+      "grad_norm": 0.1664883941411972,
+      "learning_rate": 1.3757821406889027e-06,
+      "loss": 0.1399069309234619,
+      "step": 4935
+    },
+    {
+      "epoch": 0.8988355167394468,
+      "grad_norm": 0.16675794124603271,
+      "learning_rate": 1.351802905002386e-06,
+      "loss": 0.14129226207733153,
+      "step": 4940
+    },
+    {
+      "epoch": 0.899745269286754,
+      "grad_norm": 0.17529809474945068,
+      "learning_rate": 1.3280286762869632e-06,
+      "loss": 0.14663081169128417,
+      "step": 4945
+    },
+    {
+      "epoch": 0.9006550218340611,
+      "grad_norm": 0.17758169770240784,
+      "learning_rate": 1.3044596606421795e-06,
+      "loss": 0.13986254930496217,
+      "step": 4950
+    },
+    {
+      "epoch": 0.9015647743813683,
+      "grad_norm": 0.153225839138031,
+      "learning_rate": 1.2810960623885815e-06,
+      "loss": 0.14236698150634766,
+      "step": 4955
+    },
+    {
+      "epoch": 0.9024745269286754,
+      "grad_norm": 0.169761523604393,
+      "learning_rate": 1.2579380840659376e-06,
+      "loss": 0.1450445055961609,
+      "step": 4960
+    },
+    {
+      "epoch": 0.9033842794759825,
+      "grad_norm": 0.16659331321716309,
+      "learning_rate": 1.2349859264315034e-06,
+      "loss": 0.14043926000595092,
+      "step": 4965
+    },
+    {
+      "epoch": 0.9042940320232896,
+      "grad_norm": 0.16748706996440887,
+      "learning_rate": 1.2122397884582553e-06,
+      "loss": 0.14725675582885742,
+      "step": 4970
+    },
+    {
+      "epoch": 0.9052037845705968,
+      "grad_norm": 0.1600511223077774,
+      "learning_rate": 1.1896998673331883e-06,
+      "loss": 0.14551150798797607,
+      "step": 4975
+    },
+    {
+      "epoch": 0.9061135371179039,
+      "grad_norm": 0.24318362772464752,
+      "learning_rate": 1.1673663584555934e-06,
+      "loss": 0.14470888376235963,
+      "step": 4980
+    },
+    {
+      "epoch": 0.9070232896652111,
+      "grad_norm": 0.16443821787834167,
+      "learning_rate": 1.1452394554353706e-06,
+      "loss": 0.13639854192733764,
+      "step": 4985
+    },
+    {
+      "epoch": 0.9079330422125182,
+      "grad_norm": 0.14277774095535278,
+      "learning_rate": 1.1233193500913453e-06,
+      "loss": 0.13749881982803344,
+      "step": 4990
+    },
+    {
+      "epoch": 0.9088427947598253,
+      "grad_norm": 0.1610947549343109,
+      "learning_rate": 1.1016062324496008e-06,
+      "loss": 0.1385629653930664,
+      "step": 4995
+    },
+    {
+      "epoch": 0.9097525473071325,
+      "grad_norm": 0.17888498306274414,
+      "learning_rate": 1.080100290741845e-06,
+      "loss": 0.14225621223449708,
+      "step": 5000
+    },
+    {
+      "epoch": 0.9106622998544396,
+      "grad_norm": 0.17488449811935425,
+      "learning_rate": 1.0588017114037729e-06,
+      "loss": 0.14187805652618407,
+      "step": 5005
+    },
+    {
+      "epoch": 0.9115720524017468,
+      "grad_norm": 0.16410665214061737,
+      "learning_rate": 1.0377106790734392e-06,
+      "loss": 0.1407416582107544,
+      "step": 5010
+    },
+    {
+      "epoch": 0.9124818049490538,
+      "grad_norm": 0.18115971982479095,
+      "learning_rate": 1.016827376589674e-06,
+      "loss": 0.1427263855934143,
+      "step": 5015
+    },
+    {
+      "epoch": 0.913391557496361,
+      "grad_norm": 0.18507841229438782,
+      "learning_rate": 9.961519849904898e-07,
+      "loss": 0.1390499472618103,
+      "step": 5020
+    },
+    {
+      "epoch": 0.9143013100436681,
+      "grad_norm": 0.21296796202659607,
+      "learning_rate": 9.75684683511513e-07,
+      "loss": 0.1382216691970825,
+      "step": 5025
+    },
+    {
+      "epoch": 0.9152110625909753,
+      "grad_norm": 0.2308044582605362,
+      "learning_rate": 9.55425649584435e-07,
+      "loss": 0.14271280765533448,
+      "step": 5030
+    },
+    {
+      "epoch": 0.9161208151382824,
+      "grad_norm": 0.15796682238578796,
+      "learning_rate": 9.353750588354527e-07,
+      "loss": 0.13807624578475952,
+      "step": 5035
+    },
+    {
+      "epoch": 0.9170305676855895,
+      "grad_norm": 0.1695316582918167,
+      "learning_rate": 9.155330850837834e-07,
+      "loss": 0.14289476871490478,
+      "step": 5040
+    },
+    {
+      "epoch": 0.9179403202328966,
+      "grad_norm": 0.1738404780626297,
+      "learning_rate": 8.958999003401191e-07,
+      "loss": 0.14070619344711305,
+      "step": 5045
+    },
+    {
+      "epoch": 0.9188500727802038,
+      "grad_norm": 0.20618964731693268,
+      "learning_rate": 8.764756748051662e-07,
+      "loss": 0.14535053968429565,
+      "step": 5050
+    },
+    {
+      "epoch": 0.9197598253275109,
+      "grad_norm": 0.1506137251853943,
+      "learning_rate": 8.572605768681546e-07,
+      "loss": 0.13995139598846434,
+      "step": 5055
+    },
+    {
+      "epoch": 0.9206695778748181,
+      "grad_norm": 0.17772039771080017,
+      "learning_rate": 8.382547731053708e-07,
+      "loss": 0.14470311403274536,
+      "step": 5060
+    },
+    {
+      "epoch": 0.9215793304221251,
+      "grad_norm": 0.19897456467151642,
+      "learning_rate": 8.194584282787382e-07,
+      "loss": 0.144488525390625,
+      "step": 5065
+    },
+    {
+      "epoch": 0.9224890829694323,
+      "grad_norm": 0.15899236500263214,
+      "learning_rate": 8.008717053343606e-07,
+      "loss": 0.1352991580963135,
+      "step": 5070
+    },
+    {
+      "epoch": 0.9233988355167394,
+      "grad_norm": 0.14965768158435822,
+      "learning_rate": 7.824947654011345e-07,
+      "loss": 0.13827911615371705,
+      "step": 5075
+    },
+    {
+      "epoch": 0.9243085880640466,
+      "grad_norm": 0.43651485443115234,
+      "learning_rate": 7.643277677893329e-07,
+      "loss": 0.14149526357650757,
+      "step": 5080
+    },
+    {
+      "epoch": 0.9252183406113537,
+      "grad_norm": 0.19912713766098022,
+      "learning_rate": 7.463708699892325e-07,
+      "loss": 0.14357032775878906,
+      "step": 5085
+    },
+    {
+      "epoch": 0.9261280931586608,
+      "grad_norm": 0.1635904610157013,
+      "learning_rate": 7.286242276697524e-07,
+      "loss": 0.13550699949264527,
+      "step": 5090
+    },
+    {
+      "epoch": 0.9270378457059679,
+      "grad_norm": 0.19391080737113953,
+      "learning_rate": 7.11087994677101e-07,
+      "loss": 0.14674756526947022,
+      "step": 5095
+    },
+    {
+      "epoch": 0.9279475982532751,
+      "grad_norm": 0.17458125948905945,
+      "learning_rate": 6.937623230334284e-07,
+      "loss": 0.14155579805374147,
+      "step": 5100
+    },
+    {
+      "epoch": 0.9288573508005823,
+      "grad_norm": 0.1617971807718277,
+      "learning_rate": 6.766473629355452e-07,
+      "loss": 0.140555477142334,
+      "step": 5105
+    },
+    {
+      "epoch": 0.9297671033478894,
+      "grad_norm": 0.16945427656173706,
+      "learning_rate": 6.59743262753576e-07,
+      "loss": 0.13607511520385743,
+      "step": 5110
+    },
+    {
+      "epoch": 0.9306768558951966,
+      "grad_norm": 0.18347840011119843,
+      "learning_rate": 6.43050169029702e-07,
+      "loss": 0.14903461933135986,
+      "step": 5115
+    },
+    {
+      "epoch": 0.9315866084425036,
+      "grad_norm": 0.15434837341308594,
+      "learning_rate": 6.265682264768869e-07,
+      "loss": 0.14146015644073487,
+      "step": 5120
+    },
+    {
+      "epoch": 0.9324963609898108,
+      "grad_norm": 0.1397712528705597,
+      "learning_rate": 6.10297577977606e-07,
+      "loss": 0.14261592626571656,
+      "step": 5125
+    },
+    {
+      "epoch": 0.9334061135371179,
+      "grad_norm": 0.1765873283147812,
+      "learning_rate": 5.942383645826361e-07,
+      "loss": 0.13559447526931762,
+      "step": 5130
+    },
+    {
+      "epoch": 0.9343158660844251,
+      "grad_norm": 0.1656057983636856,
+      "learning_rate": 5.783907255098003e-07,
+      "loss": 0.13961490392684936,
+      "step": 5135
+    },
+    {
+      "epoch": 0.9352256186317321,
+      "grad_norm": 0.2169366180896759,
+      "learning_rate": 5.627547981427894e-07,
+      "loss": 0.1447835922241211,
+      "step": 5140
+    },
+    {
+      "epoch": 0.9361353711790393,
+      "grad_norm": 0.18623125553131104,
+      "learning_rate": 5.473307180299508e-07,
+      "loss": 0.14366730451583862,
+      "step": 5145
+    },
+    {
+      "epoch": 0.9370451237263464,
+      "grad_norm": 0.15423963963985443,
+      "learning_rate": 5.32118618883129e-07,
+      "loss": 0.14295632839202882,
+      "step": 5150
+    },
+    {
+      "epoch": 0.9379548762736536,
+      "grad_norm": 0.18423247337341309,
+      "learning_rate": 5.17118632576491e-07,
+      "loss": 0.14137414693832398,
+      "step": 5155
+    },
+    {
+      "epoch": 0.9388646288209607,
+      "grad_norm": 0.15338757634162903,
+      "learning_rate": 5.023308891453915e-07,
+      "loss": 0.13583066463470458,
+      "step": 5160
+    },
+    {
+      "epoch": 0.9397743813682679,
+      "grad_norm": 0.2293633222579956,
+      "learning_rate": 4.877555167852515e-07,
+      "loss": 0.14819620847702025,
+      "step": 5165
+    },
+    {
+      "epoch": 0.9406841339155749,
+      "grad_norm": 0.16889944672584534,
+      "learning_rate": 4.7339264185043974e-07,
+      "loss": 0.13617686033248902,
+      "step": 5170
+    },
+    {
+      "epoch": 0.9415938864628821,
+      "grad_norm": 0.1767464578151703,
+      "learning_rate": 4.5924238885316775e-07,
+      "loss": 0.13487552404403685,
+      "step": 5175
+    },
+    {
+      "epoch": 0.9425036390101892,
+      "grad_norm": 0.16697899997234344,
+      "learning_rate": 4.453048804624327e-07,
+      "loss": 0.1446886420249939,
+      "step": 5180
+    },
+    {
+      "epoch": 0.9434133915574964,
+      "grad_norm": 0.19576266407966614,
+      "learning_rate": 4.315802375029293e-07,
+      "loss": 0.14252450466156005,
+      "step": 5185
+    },
+    {
+      "epoch": 0.9443231441048034,
+      "grad_norm": 0.14838077127933502,
+      "learning_rate": 4.18068578954034e-07,
+      "loss": 0.13933032751083374,
+      "step": 5190
+    },
+    {
+      "epoch": 0.9452328966521106,
+      "grad_norm": 0.18481744825839996,
+      "learning_rate": 4.047700219487388e-07,
+      "loss": 0.1410665273666382,
+      "step": 5195
+    },
+    {
+      "epoch": 0.9461426491994177,
+      "grad_norm": 0.16954176127910614,
+      "learning_rate": 3.9168468177265547e-07,
+      "loss": 0.1421758770942688,
+      "step": 5200
+    },
+    {
+      "epoch": 0.9470524017467249,
+      "grad_norm": 0.17614421248435974,
+      "learning_rate": 3.7881267186301306e-07,
+      "loss": 0.14059911966323851,
+      "step": 5205
+    },
+    {
+      "epoch": 0.9479621542940321,
+      "grad_norm": 0.1637226939201355,
+      "learning_rate": 3.6615410380767544e-07,
+      "loss": 0.1360395908355713,
+      "step": 5210
+    },
+    {
+      "epoch": 0.9488719068413392,
+      "grad_norm": 0.18330250680446625,
+      "learning_rate": 3.5370908734417006e-07,
+      "loss": 0.14543824195861815,
+      "step": 5215
+    },
+    {
+      "epoch": 0.9497816593886463,
+      "grad_norm": 0.1895420402288437,
+      "learning_rate": 3.414777303587413e-07,
+      "loss": 0.15304578542709352,
+      "step": 5220
+    },
+    {
+      "epoch": 0.9506914119359534,
+      "grad_norm": 0.15384933352470398,
+      "learning_rate": 3.294601388854041e-07,
+      "loss": 0.14675912857055665,
+      "step": 5225
+    },
+    {
+      "epoch": 0.9516011644832606,
+      "grad_norm": 0.20188499987125397,
+      "learning_rate": 3.1765641710505e-07,
+      "loss": 0.14068362712860108,
+      "step": 5230
+    },
+    {
+      "epoch": 0.9525109170305677,
+      "grad_norm": 0.16467279195785522,
+      "learning_rate": 3.060666673445123e-07,
+      "loss": 0.14733167886734008,
+      "step": 5235
+    },
+    {
+      "epoch": 0.9534206695778749,
+      "grad_norm": 0.16632016003131866,
+      "learning_rate": 2.9469099007569943e-07,
+      "loss": 0.13753929138183593,
+      "step": 5240
+    },
+    {
+      "epoch": 0.9543304221251819,
+      "grad_norm": 0.1477566957473755,
+      "learning_rate": 2.83529483914724e-07,
+      "loss": 0.14354891777038575,
+      "step": 5245
+    },
+    {
+      "epoch": 0.9552401746724891,
+      "grad_norm": 0.1693645417690277,
+      "learning_rate": 2.7258224562102805e-07,
+      "loss": 0.14622807502746582,
+      "step": 5250
+    },
+    {
+      "epoch": 0.9561499272197962,
+      "grad_norm": 0.17574062943458557,
+      "learning_rate": 2.6184937009657295e-07,
+      "loss": 0.1344899296760559,
+      "step": 5255
+    },
+    {
+      "epoch": 0.9570596797671034,
+      "grad_norm": 0.17448563873767853,
+      "learning_rate": 2.513309503850009e-07,
+      "loss": 0.1355789542198181,
+      "step": 5260
+    },
+    {
+      "epoch": 0.9579694323144105,
+      "grad_norm": 0.16993778944015503,
+      "learning_rate": 2.41027077670819e-07,
+      "loss": 0.151595401763916,
+      "step": 5265
+    },
+    {
+      "epoch": 0.9588791848617176,
+      "grad_norm": 0.16944102942943573,
+      "learning_rate": 2.3093784127863062e-07,
+      "loss": 0.1466623306274414,
+      "step": 5270
+    },
+    {
+      "epoch": 0.9597889374090247,
+      "grad_norm": 0.18085163831710815,
+      "learning_rate": 2.2106332867234402e-07,
+      "loss": 0.14645814895629883,
+      "step": 5275
+    },
+    {
+      "epoch": 0.9606986899563319,
+      "grad_norm": 0.14682307839393616,
+      "learning_rate": 2.1140362545442605e-07,
+      "loss": 0.13901774883270263,
+      "step": 5280
+    },
+    {
+      "epoch": 0.961608442503639,
+      "grad_norm": 0.17189526557922363,
+      "learning_rate": 2.0195881536514694e-07,
+      "loss": 0.14153491258621215,
+      "step": 5285
+    },
+    {
+      "epoch": 0.9625181950509462,
+      "grad_norm": 0.1977207362651825,
+      "learning_rate": 1.9272898028186714e-07,
+      "loss": 0.1437437653541565,
+      "step": 5290
+    },
+    {
+      "epoch": 0.9634279475982532,
+      "grad_norm": 0.16637668013572693,
+      "learning_rate": 1.837142002183184e-07,
+      "loss": 0.13910138607025146,
+      "step": 5295
+    },
+    {
+      "epoch": 0.9643377001455604,
+      "grad_norm": 0.18155774474143982,
+      "learning_rate": 1.7491455332391548e-07,
+      "loss": 0.14177814722061158,
+      "step": 5300
+    },
+    {
+      "epoch": 0.9652474526928675,
+      "grad_norm": 0.32478174567222595,
+      "learning_rate": 1.6633011588307878e-07,
+      "loss": 0.14292703866958617,
+      "step": 5305
+    },
+    {
+      "epoch": 0.9661572052401747,
+      "grad_norm": 0.18050940334796906,
+      "learning_rate": 1.5796096231456558e-07,
+      "loss": 0.13252723217010498,
+      "step": 5310
+    },
+    {
+      "epoch": 0.9670669577874818,
+      "grad_norm": 0.15919657051563263,
+      "learning_rate": 1.4980716517083715e-07,
+      "loss": 0.14491976499557496,
+      "step": 5315
+    },
+    {
+      "epoch": 0.9679767103347889,
+      "grad_norm": 0.15895310044288635,
+      "learning_rate": 1.4186879513741758e-07,
+      "loss": 0.13617006540298462,
+      "step": 5320
+    },
+    {
+      "epoch": 0.9688864628820961,
+      "grad_norm": 0.1543736606836319,
+      "learning_rate": 1.3414592103228595e-07,
+      "loss": 0.14220429658889772,
+      "step": 5325
+    },
+    {
+      "epoch": 0.9697962154294032,
+      "grad_norm": 0.16660647094249725,
+      "learning_rate": 1.2663860980528797e-07,
+      "loss": 0.14069980382919312,
+      "step": 5330
+    },
+    {
+      "epoch": 0.9707059679767104,
+      "grad_norm": 0.15238550305366516,
+      "learning_rate": 1.1934692653754186e-07,
+      "loss": 0.13978019952774048,
+      "step": 5335
+    },
+    {
+      "epoch": 0.9716157205240175,
+      "grad_norm": 0.1649473011493683,
+      "learning_rate": 1.1227093444088066e-07,
+      "loss": 0.1401435136795044,
+      "step": 5340
+    },
+    {
+      "epoch": 0.9725254730713246,
+      "grad_norm": 0.14920124411582947,
+      "learning_rate": 1.0541069485730249e-07,
+      "loss": 0.13952178955078126,
+      "step": 5345
+    },
+    {
+      "epoch": 0.9734352256186317,
+      "grad_norm": 0.16802479326725006,
+      "learning_rate": 9.876626725844329e-08,
+      "loss": 0.14808181524276734,
+      "step": 5350
+    },
+    {
+      "epoch": 0.9743449781659389,
+      "grad_norm": 0.18096603453159332,
+      "learning_rate": 9.233770924505781e-08,
+      "loss": 0.13938647508621216,
+      "step": 5355
+    },
+    {
+      "epoch": 0.975254730713246,
+      "grad_norm": 0.1658579558134079,
+      "learning_rate": 8.612507654651991e-08,
+      "loss": 0.14219754934310913,
+      "step": 5360
+    },
+    {
+      "epoch": 0.9761644832605532,
+      "grad_norm": 0.1547713279724121,
+      "learning_rate": 8.012842302033696e-08,
+      "loss": 0.14298388957977295,
+      "step": 5365
+    },
+    {
+      "epoch": 0.9770742358078602,
+      "grad_norm": 0.18247587978839874,
+      "learning_rate": 7.434780065169178e-08,
+      "loss": 0.14103788137435913,
+      "step": 5370
+    },
+    {
+      "epoch": 0.9779839883551674,
+      "grad_norm": 0.17593605816364288,
+      "learning_rate": 6.878325955297915e-08,
+      "loss": 0.1450013041496277,
+      "step": 5375
+    },
+    {
+      "epoch": 0.9788937409024745,
+      "grad_norm": 0.17178039252758026,
+      "learning_rate": 6.343484796338395e-08,
+      "loss": 0.14021269083023072,
+      "step": 5380
+    },
+    {
+      "epoch": 0.9798034934497817,
+      "grad_norm": 0.17904147505760193,
+      "learning_rate": 5.830261224845923e-08,
+      "loss": 0.1460060477256775,
+      "step": 5385
+    },
+    {
+      "epoch": 0.9807132459970888,
+      "grad_norm": 0.16323266923427582,
+      "learning_rate": 5.338659689971548e-08,
+      "loss": 0.13915741443634033,
+      "step": 5390
+    },
+    {
+      "epoch": 0.9816229985443959,
+      "grad_norm": 0.1829039305448532,
+      "learning_rate": 4.8686844534248655e-08,
+      "loss": 0.1372266888618469,
+      "step": 5395
+    },
+    {
+      "epoch": 0.982532751091703,
+      "grad_norm": 0.16742415726184845,
+      "learning_rate": 4.420339589435995e-08,
+      "loss": 0.14404670000076295,
+      "step": 5400
+    },
+    {
+      "epoch": 0.9834425036390102,
+      "grad_norm": 0.20223695039749146,
+      "learning_rate": 3.9936289847206097e-08,
+      "loss": 0.14259873628616332,
+      "step": 5405
+    },
+    {
+      "epoch": 0.9843522561863173,
+      "grad_norm": 0.16556286811828613,
+      "learning_rate": 3.588556338446625e-08,
+      "loss": 0.1486513614654541,
+      "step": 5410
+    },
+    {
+      "epoch": 0.9852620087336245,
+      "grad_norm": 0.17593473196029663,
+      "learning_rate": 3.205125162201727e-08,
+      "loss": 0.1421863079071045,
+      "step": 5415
+    },
+    {
+      "epoch": 0.9861717612809315,
+      "grad_norm": 0.16081394255161285,
+      "learning_rate": 2.8433387799631228e-08,
+      "loss": 0.14347248077392577,
+      "step": 5420
+    },
+    {
+      "epoch": 0.9870815138282387,
+      "grad_norm": 0.16976398229599,
+      "learning_rate": 2.503200328067834e-08,
+      "loss": 0.13244209289550782,
+      "step": 5425
+    },
+    {
+      "epoch": 0.9879912663755459,
+      "grad_norm": 0.1760631501674652,
+      "learning_rate": 2.1847127551874458e-08,
+      "loss": 0.13686281442642212,
+      "step": 5430
+    },
+    {
+      "epoch": 0.988901018922853,
+      "grad_norm": 0.1616654098033905,
+      "learning_rate": 1.8878788223009036e-08,
+      "loss": 0.14251030683517457,
+      "step": 5435
+    },
+    {
+      "epoch": 0.9898107714701602,
+      "grad_norm": 0.17077618837356567,
+      "learning_rate": 1.6127011026703663e-08,
+      "loss": 0.13724164962768554,
+      "step": 5440
+    },
+    {
+      "epoch": 0.9907205240174672,
+      "grad_norm": 0.17870111763477325,
+      "learning_rate": 1.359181981820945e-08,
+      "loss": 0.13891533613204957,
+      "step": 5445
+    },
+    {
+      "epoch": 0.9916302765647744,
+      "grad_norm": 0.17270788550376892,
+      "learning_rate": 1.1273236575173873e-08,
+      "loss": 0.1403287410736084,
+      "step": 5450
+    },
+    {
+      "epoch": 0.9925400291120815,
+      "grad_norm": 0.14556582272052765,
+      "learning_rate": 9.171281397471476e-09,
+      "loss": 0.14006919860839845,
+      "step": 5455
+    },
+    {
+      "epoch": 0.9934497816593887,
+      "grad_norm": 0.173613503575325,
+      "learning_rate": 7.285972507017902e-09,
+      "loss": 0.13570475578308105,
+      "step": 5460
+    },
+    {
+      "epoch": 0.9943595342066958,
+      "grad_norm": 0.17131361365318298,
+      "learning_rate": 5.617326247614463e-09,
+      "loss": 0.13654524087905884,
+      "step": 5465
+    },
+    {
+      "epoch": 0.995269286754003,
+      "grad_norm": 0.1668723076581955,
+      "learning_rate": 4.1653570848121385e-09,
+      "loss": 0.13835798501968383,
+      "step": 5470
+    },
+    {
+      "epoch": 0.99617903930131,
+      "grad_norm": 0.17216536402702332,
+      "learning_rate": 2.9300776057727962e-09,
+      "loss": 0.1460067868232727,
+      "step": 5475
+    },
+    {
+      "epoch": 0.9970887918486172,
+      "grad_norm": 0.17424245178699493,
+      "learning_rate": 1.911498519177601e-09,
+      "loss": 0.13620004653930665,
+      "step": 5480
+    },
+    {
+      "epoch": 0.9979985443959243,
+      "grad_norm": 0.20578418672084808,
+      "learning_rate": 1.1096286551187663e-09,
+      "loss": 0.13895630836486816,
+      "step": 5485
+    },
+    {
+      "epoch": 0.9989082969432315,
+      "grad_norm": 0.19702354073524475,
+      "learning_rate": 5.244749650301639e-10,
+      "loss": 0.14265141487121583,
+      "step": 5490
+    },
+    {
+      "epoch": 0.9998180494905385,
+      "grad_norm": 0.16503211855888367,
+      "learning_rate": 1.560425216318162e-10,
+      "loss": 0.15260394811630248,
+      "step": 5495
+    },
+    {
+      "epoch": 1.0007278020378456,
+      "grad_norm": 0.18294842541217804,
+      "learning_rate": 4.334518874382631e-12,
+      "loss": 0.14066877365112304,
+      "step": 5500
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.0227691751004954e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-5500/training_args.bin b/checkpoint-5500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-5500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-600/README.md b/checkpoint-600/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-600/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for the unsloth/gemma-4-E4B-it LoRA Adapter
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for text generation
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** unsloth/gemma-4-E4B-it
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-600/adapter_config.json b/checkpoint-600/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-600/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-600/adapter_model.safetensors b/checkpoint-600/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f3df1e614ada9585a69f31c15e39d6ed0612e47c
--- /dev/null
+++ b/checkpoint-600/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:079988b42f6ac1d048db321d7423b2482c1a86094d65b822f0554bcb55d53829
+size 169741912
diff --git a/checkpoint-600/chat_template.jinja b/checkpoint-600/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-600/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}{#- Renders each (key, value) pair of properties, iterated in dictsort order, as a comma-separated key:{ ... } schema entry. The required argument is accepted but never read in this macro body. When filter_keys is true, keys listed in standard_keys are skipped. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}{#- found_first becomes true once an entry was emitted, so later entries get a leading comma -#}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}{#- add_comma becomes true once this entry has emitted a field, so later fields get a leading comma -#}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}{#- STRING: emit the enum list when one is present -#}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}{#- ARRAY: emit the non-none keys of a non-empty items mapping; properties, required and type get dedicated rendering, other keys go through format_argument -#}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}{#- OBJECT: recurse into nested properties, then list the required names -#}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}{#- no properties mapping: recurse over the value's own keys with filter_keys=true so the standard schema keys are skipped -#}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}{#- type is always the last field; the brace after it closes the entry -#}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}{#- Renders one tool as a declaration: the function name, then a brace-delimited body with its description, an optional parameters section (properties, required, type) and an optional response section (description, type). tool_data is read through tool_data['function']. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>
+        {%- endif -%}{{- '}' -}}{#- always close the parameters brace; it used to be emitted only together with type, leaving the declaration unbalanced for schemas without a type. Output is unchanged when type is present. -#}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>
+        {%- endif -%}{{- '}' -}}{#- always close the response brace; it used to be emitted only for OBJECT responses. Output is unchanged for OBJECT responses. -#}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}{#- Serializes one value recursively. Strings are wrapped in the <|"|> delimiter, booleans become true/false, mappings become {key:value,...} in dictsort order, other sequences become [a,b,...], and anything else is printed as-is. Mapping keys are wrapped in the delimiter only when escape_keys is truthy. -#}
+    {%- if argument is string -%}{#- tested before the sequence branch; presumably because strings also satisfy the sequence test -#}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}{#- Returns text with every span from a <|channel> marker up to the next <channel|> marker removed, then trimmed. A <|channel> marker with no closing marker drops everything after it. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}{#- keep only the text that precedes the opening marker in this part -#}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}{#- Emits one tool result wrapped in <|tool_response> ... <tool_response|>. A mapping response is rendered as response:NAME{key:value,...} in dictsort order; any other response is rendered as response:NAME{value:...}. tool_name is string-concatenated, so it must be a string. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}{#- prev_message_type records the kind of the last emitted element; it is read later when closing a turn and when deciding whether to emit the generation prompt -#}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- System turn: opened when enable_thinking is set, tools are given, or the first message has role system/developer. It emits the optional <|think|> marker, the system text, and one <|tool> ... <tool|> declaration per tool. -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject the thinking token at the very top of the first system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}{#- the system/developer message is consumed here, so the main loop starts at the second message -#}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: record the index within loop_messages of the last user message (stays -1 when there is none). The main loop renders reasoning only for messages after that index. -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Main loop: render every non-tool message as a turn. Messages with role tool are skipped here; they are rendered by the forward scan of a preceding assistant message that has tool_calls. -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as a thought channel, but only for messages that come after the last user message and also carry tool_calls -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native). default is called with boolean=true so a name that is present but none/empty also falls back to 'unknown'. -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown', true), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve the function name: start from the tool message's own name, then prefer the name of the tool call whose id equals tool_call_id. get() yields none (not undefined) for a missing key, so default needs boolean=true to fall back to 'unknown'; otherwise none reaches a string concatenation and raises. -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown', true)) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('', true)) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+        {#- Close the turn: a tool call without any tool response ends on an open <|tool_response> marker; a message that emitted tool responses but no content emits no <turn|>; every other message ends with <turn|>. -#}
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}{#- Open a new model turn for generation unless the last rendered element was a tool call or a tool response. NOTE(review): an assistant message with tool responses and non-empty content emits <turn|> but leaves prev_message_type at 'tool_response', so no new model turn is opened after it; confirm this is intended. -#}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-600/optimizer.pt b/checkpoint-600/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6fda89d6b203087797d99363bb2e51c2475ba10d
--- /dev/null
+++ b/checkpoint-600/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:253cd8f2f837c96e77fd97da074f4de32dac3ab6554465896c052e17a99224e0
+size 72807355
diff --git a/checkpoint-600/processor_config.json b/checkpoint-600/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-600/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-600/rng_state.pth b/checkpoint-600/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-600/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-600/scheduler.pt b/checkpoint-600/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..696d2dcd63a1a52d8a725d739aa534f92ae2f9b6
--- /dev/null
+++ b/checkpoint-600/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a077206253fffe03d7133f10fc8a0dcefffd2da6f33c9dacdf9ab88464770da4
+size 1465
diff --git a/checkpoint-600/tokenizer.json b/checkpoint-600/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-600/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-600/tokenizer_config.json b/checkpoint-600/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-600/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-600/trainer_state.json b/checkpoint-600/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..586fa47a37ccc44f5501b31eb922888617139321
--- /dev/null
+++ b/checkpoint-600/trainer_state.json
@@ -0,0 +1,882 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.1091703056768559,
+  "eval_steps": 100,
+  "global_step": 600,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.3860575973684506e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-600/training_args.bin b/checkpoint-600/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-600/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-700/README.md b/checkpoint-700/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-700/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-700/adapter_config.json b/checkpoint-700/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-700/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-700/adapter_model.safetensors b/checkpoint-700/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ef669b5fe5a5e86ab0b2c5d53613e1afc15f70c1
--- /dev/null
+++ b/checkpoint-700/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfd534ebc440aac595d09991db009cd775fc55bd7bff64bcd8fdb6db3fddbf91
+size 169741912
diff --git a/checkpoint-700/chat_template.jinja b/checkpoint-700/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-700/chat_template.jinja
@@ -0,0 +1,351 @@
+{#- format_parameters(properties, required, filter_keys=false)
+    Renders a schema `properties` mapping as comma-separated `key:{...}` entries,
+    visiting keys in `dictsort` order. Inside each entry the fields are emitted in
+    this order: description, enum (STRING type only), items (ARRAY type only),
+    nullable, properties and required (OBJECT type only), and finally the
+    upper-cased type, which also closes the entry with a brace.
+    Arguments:
+      properties  - mapping of property name to its schema mapping.
+      required    - passed by every caller but not read anywhere in this macro body.
+      filter_keys - when true, keys listed in `standard_keys` are skipped instead of
+                    being rendered as properties.
+-#}
+{%- macro format_parameters(properties, required, filter_keys=false) -%}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {#- A namespace is used so the flag set inside the loop survives across iterations. -#}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {#- add_comma: true once any field of the current entry has been written, so the next field is preceded by a comma. -#}
+        {%- set add_comma = false -%}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {#- Entries of the items schema whose value is none are omitted entirely. -#}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {#- The item type may be a single string or a list of strings; both are upper-cased. -#}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {#- OBJECT without a `properties` mapping: the schema mapping itself is rendered as the property map, with filter_keys=true so the keys in standard_keys are skipped. -#}
+                {%- elif value is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {#- The type field is always written last and its trailing brace closes the entry opened at `key:`. -#}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{#- format_function_declaration(tool_data)
+    Renders one tool definition as `declaration:NAME{description:...}` with optional
+    `,parameters:{...}` and `,response:{...}` sections.
+    Reads tool_data['function'] for: name, description, parameters (properties,
+    required, type) and, when present, response (description, type).
+    NOTE(review): the brace that closes `parameters:{` is written only inside the
+    `params['type']` branch, and the brace that closes `response:{` only when the
+    response type upper-cases to OBJECT. If those conditions do not hold the section
+    is left unclosed - confirm that callers always supply these types.
+-#}
+{%- macro format_function_declaration(tool_data) -%}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {#- The trailing brace on the next output line closes `parameters:{`. -#}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {#- The trailing brace on the next output line closes `response:{`. -#}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {#- Closes the brace opened right after the declaration name. -#}
+    }
+{%- endmacro -%}
+{#- format_argument(argument, escape_keys=True)
+    Recursively serialises a value:
+      string   - wrapped in the <|"|> quote token on both sides;
+      boolean  - the literal `true` or `false`;
+      mapping  - `{k:v,...}` with keys in `dictsort` order; keys are wrapped in the
+                 quote token only when escape_keys is true;
+      sequence - `[a,b,...]`;
+      other    - rendered as-is.
+    escape_keys is passed unchanged to every nested call.
+-#}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {#- The string test comes before the sequence test, so strings take the quoted branch. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{#- strip_thinking(text)
+    Returns `text` without its thinking-channel spans. The text is split on the
+    channel-close token '<channel|>'; for every piece that contains the channel-open
+    token '<|channel>' only the part before the first open token is kept, while
+    pieces without an open token are kept whole. The kept parts are concatenated
+    and the result is trimmed.
+-#}
+{%- macro strip_thinking(text) -%}
+    {#- A namespace is needed so the accumulated string survives the loop scope. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{#- format_tool_response_block(tool_name, response)
+    Emits one tool result wrapped in '<|tool_response>' ... '<tool_response|>'.
+    A mapping response is rendered as `response:NAME{k:v,...}` with keys in
+    `dictsort` order; any other response is rendered as `response:NAME{value:...}`.
+    Values go through format_argument with escape_keys=False, so nested mapping
+    keys are written without the quote token.
+-#}
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{#- Top-level rendering script.
+    Inputs read from the render context: messages, bos_token, tools,
+    enable_thinking, add_generation_prompt.
+    Output order: bos_token, an optional system turn (thinking flag, leading
+    system/developer message, tool declarations), then one turn per remaining
+    non-tool message, then an optional '<|turn>model' generation prompt.
+    ns.prev_message_type records the kind of the last special element emitted; it
+    decides how a turn is closed and whether the generation prompt is written.
+-#}
+{%- set ns = namespace(prev_message_type=None) -%}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block -#}
+{#- The system turn is opened when thinking is enabled, when tools are supplied, or when the first message has role system or developer. -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {#- Content-parts form: each part's text is trimmed and followed by a single space. -#}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {#- The leading system/developer message has been consumed; the main loop starts after it. -#}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages -#}
+{%- for message in loop_messages -%}
+    {#- Messages with role 'tool' produce no turn of their own; they are rendered only by the forward scan below, from an assistant message that carries tool_calls. -#}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {#- Walk backwards to the nearest earlier message whose role is not 'tool'; the found flag makes later iterations no-ops. -#}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {#- Reasoning is written only for messages after the last user message that also carry tool_calls. -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {#- Mapping arguments are serialised key by key in sorted order; string arguments are inserted verbatim. -#}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {#- ns_tr_out.flag becomes true once at least one tool response has been written for this message. -#}
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {#- Once a non-tool message is seen the scan is marked stopped and every later index is ignored. -#}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {#- Only parts whose type is 'text' contribute; their text is concatenated without a separator. -#}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {#- The message content is rendered into a variable first so that it can be tested for emptiness after being emitted. -#}
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {#- Turn closing: after tool calls with no response written, an opening '<|tool_response>' is emitted instead of closing the turn; a turn that wrote tool responses but no content is left open; every other turn is closed with '<turn|>'. -#}
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- The generation prompt is omitted when the last element written was a tool call or a tool response. -#}
+{%- if add_generation_prompt -%}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-700/optimizer.pt b/checkpoint-700/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b3a757ea998411e00a63daf895731d0a4b0d5973
--- /dev/null
+++ b/checkpoint-700/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb322ffb9f7cf0b6017382f053576419422b24981728ada8328ae1108923e289
+size 72807355
diff --git a/checkpoint-700/processor_config.json b/checkpoint-700/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-700/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-700/rng_state.pth b/checkpoint-700/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-700/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-700/scheduler.pt b/checkpoint-700/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..19acdc4ff8df16bf1bddaaf0954a21835611642d
--- /dev/null
+++ b/checkpoint-700/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a175384262eeb10988bcb724a64316d5dcf76ffb38ae5f2f079617356a41f965
+size 1465
diff --git a/checkpoint-700/tokenizer.json b/checkpoint-700/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-700/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-700/tokenizer_config.json b/checkpoint-700/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-700/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-700/trainer_state.json b/checkpoint-700/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..69d2feae831ab331099f797648b54c70fc20bc13
--- /dev/null
+++ b/checkpoint-700/trainer_state.json
@@ -0,0 +1,1022 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.12736535662299855,
+  "eval_steps": 100,
+  "global_step": 700,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.942520566427561e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-700/training_args.bin b/checkpoint-700/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-700/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-800/README.md b/checkpoint-800/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-800/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-800/adapter_config.json b/checkpoint-800/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-800/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-800/adapter_model.safetensors b/checkpoint-800/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a3659672fd34a35af61f8aa4a5253fadc26d8297
--- /dev/null
+++ b/checkpoint-800/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7201736ae99ac88b287b440523029679d25d06fbadcb1429efff61306ad6b2d
+size 169741912
diff --git a/checkpoint-800/chat_template.jinja b/checkpoint-800/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-800/chat_template.jinja
@@ -0,0 +1,351 @@
+{#- format_parameters: render a mapping of schema-style property definitions as comma-separated `key:{...}` entries, visiting keys in dictsort order. -#}
+{#- When filter_keys is true, keys listed in standard_keys are skipped. NOTE(review): the `required` argument is accepted but never read in this macro body. -#}
+{%- macro format_parameters(properties, required, filter_keys=false) -%}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {#- Namespace so the "already emitted one entry" flag survives across loop iterations. -#}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {#- add_comma tracks whether a field has already been written inside the current entry. -#}
+        {%- set add_comma = false -%}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {#- Type-specific extras: enum for STRING, items for ARRAY. -#}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {#- Keys of the items schema whose value is none are omitted entirely. -#}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {#- Nested object: recurse into its declared properties, or, when none are declared, into the value mapping itself with the standard keys filtered out. -#}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {#- Every entry ends with its upper-cased type; the trailing brace closes the `key:{` opened above. -#}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{#- format_function_declaration: render one tool definition as `declaration:NAME{description:...,parameters:{...},response:{...}}`, reading everything from tool_data['function']. -#}
+{#- NOTE(review): the brace closing `parameters:{` is emitted only inside the params['type'] branch, and the one closing `response:{` only when the response type is OBJECT; confirm callers always supply those, otherwise the output braces look unbalanced. -#}
+{%- macro format_function_declaration(tool_data) -%}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {#- Optional response schema: only a description and an OBJECT type marker are rendered. -#}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{#- format_argument: recursively serialize a value. Strings are wrapped in the <|"|> quote token, booleans become true/false, mappings become `{k:v,...}` in dictsort order, sequences become `[a,b,...]`, and anything else is emitted as-is. -#}
+{#- escape_keys controls whether mapping keys are also wrapped in the quote token; it is passed through unchanged to nested calls. -#}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {#- The string test comes first: Jinja's `sequence` test is also true for strings. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{#- strip_thinking: remove `<|channel>...<channel|>` spans from text. The text is split on the closing marker; for each piece only the part before an opening marker (if any) is kept. The concatenated result is trimmed. -#}
+{%- macro strip_thinking(text) -%}
+    {#- Namespace so the accumulated string survives across loop iterations. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{#- format_tool_response_block: wrap one tool result as `<|tool_response>response:NAME{...}<tool_response|>`. A mapping response is rendered as key:value pairs in dictsort order with unquoted keys; any other response is rendered as a single `value:` field. -#}
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{#- ns.prev_message_type records the kind of the last thing emitted; it is read at the end of each message and by the generation-prompt check at the bottom of the template. -#}
+{%- set ns = namespace(prev_message_type=None) -%}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- System turn: opened when thinking is enabled, tools are supplied, or the first message has role system/developer. -#}
+{#- NOTE(review): messages[0] is indexed without an emptiness check; presumably callers always pass at least one message, confirm. -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {#- String content is trimmed and emitted; list content emits each item's trimmed text followed by a space. -#}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {#- The system/developer message is consumed here, so the main loop starts at the second message. -#}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: record the index (within loop_messages) of the last user message, or -1 when there is none. The message loop uses it to render reasoning only for messages after that index. -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages. Messages with role 'tool' are skipped here; they are rendered only by the forward scan performed from the preceding assistant message that carries tool_calls. -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {#- Walk backwards; the found flag makes only the nearest non-tool message count. -#}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel, but only for messages after the last user message that also carry tool_calls -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {#- Each tool call is emitted as <|tool_call>call:NAME{args}<tool_call|>; mapping arguments are rendered key by key in dictsort order, string arguments are inserted verbatim. -#}
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {#- ns_tr_out.flag becomes true once at least one tool response block has been written for this message. -#}
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages; the stopped flag ends the scan at the first non-tool message -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name; falls back to the tool message's own name, then to 'unknown' -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array; for an array only the parts of type 'text' are concatenated -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {#- Content is rendered into a variable first so that its emptiness can be tested below. Model text goes through strip_thinking, other roles are trimmed; image/audio/video parts become placeholder tokens. -#}
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {#- Turn ending: after tool calls with no response written, emit an opening <|tool_response> marker instead of closing the turn; when responses were written and there is no content, emit nothing; otherwise close the turn. -#}
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Generation prompt: open a new model turn unless the last emitted element was a tool call or tool response, in which case no new turn header is written. -#}
+{%- if add_generation_prompt -%}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-800/optimizer.pt b/checkpoint-800/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c99a70931b6823bfe79edacc5fbebafbf8b247cc
--- /dev/null
+++ b/checkpoint-800/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eceee864faeb468b238da74ec39df8a2fc1ca8a0040ebd622b313c0cbaeb8f93
+size 72807355
diff --git a/checkpoint-800/processor_config.json b/checkpoint-800/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-800/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-800/rng_state.pth b/checkpoint-800/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-800/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-800/scheduler.pt b/checkpoint-800/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..236cf74dcd1d495228ee0918233b17f512f03d86
--- /dev/null
+++ b/checkpoint-800/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c184a427ce0ea76af847dd0f82d4738b58b7d908e5a757bb1e42fd2930717745
+size 1465
diff --git a/checkpoint-800/tokenizer.json b/checkpoint-800/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-800/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-800/tokenizer_config.json b/checkpoint-800/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-800/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-800/trainer_state.json b/checkpoint-800/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..34289530de5a025deb346a8891b33b079229a560
--- /dev/null
+++ b/checkpoint-800/trainer_state.json
@@ -0,0 +1,1162 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.14556040756914118,
+  "eval_steps": 100,
+  "global_step": 800,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.491847953207176e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-800/training_args.bin b/checkpoint-800/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-800/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/checkpoint-900/README.md b/checkpoint-900/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e
--- /dev/null
+++ b/checkpoint-900/README.md
@@ -0,0 +1,210 @@
+---
+base_model: unsloth/gemma-4-E4B-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/gemma-4-E4B-it
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.19.1
\ No newline at end of file
diff --git a/checkpoint-900/adapter_config.json b/checkpoint-900/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228
--- /dev/null
+++ b/checkpoint-900/adapter_config.json
@@ -0,0 +1,52 @@
+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Gemma4ForConditionalGeneration",
+    "parent_library": "transformers.models.gemma4.modeling_gemma4",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-900/adapter_model.safetensors b/checkpoint-900/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4d23f27e223e86c43bfa2f80bed9064f02128aa8
--- /dev/null
+++ b/checkpoint-900/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:448f0cce2d69719e9a51b87d34c93ec5d9a1457f17e4b66a61c07c91a90a6501
+size 169741912
diff --git a/checkpoint-900/chat_template.jinja b/checkpoint-900/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7
--- /dev/null
+++ b/checkpoint-900/chat_template.jinja
@@ -0,0 +1,351 @@
+{%- macro format_parameters(properties, required, filter_keys=false) -%}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}
+        {%- if not filter_keys or key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'OBJECT' -%}
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                    }
+                {%- elif value is mapping -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    properties:{
+                    {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
+                    }
+                {%- endif -%}
+                {%- if value['required'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+
+{%- set ns = namespace(prev_message_type=None) -%}
+{%- set loop_messages = messages -%}
+{{- bos_token -}}
+{#- Handle System/Tool Definitions Block -#}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {#- Inject Thinking token at the very top of the FIRST system turn -#}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>\n' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {%- if messages[0]['content'] is string -%}
+            {{- messages[0]['content'] | trim -}}
+        {%- elif messages[0]['content'] is sequence -%}
+            {%- for item in messages[0]['content'] -%}
+                {{- item['text'] | trim + ' '-}}
+            {%- endfor -%}
+        {%- endif -%}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+
+{#- Pre-scan: find last user message index for reasoning guard -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Loop through messages -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+
+    {#- Render reasoning/reasoning_content as thinking channel -#}
+    {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
+    {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
+    {%- endif -%}
+
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {#- Resolve tool_call_id to function name -#}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {#- Handle content as string or content-parts array -#}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+
+            {%- set captured_content -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '<|image|>' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '<|video|>' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- endset -%}
+
+            {{- captured_content -}}
+            {%- set has_content = captured_content | trim | length > 0 -%}
+
+        {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
+            {{- '<|tool_response>' -}}
+        {%- elif not (ns_tr_out.flag and not has_content) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+    {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/checkpoint-900/optimizer.pt b/checkpoint-900/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8d84c0563604680a38932c53f3e553b60cebdad2
--- /dev/null
+++ b/checkpoint-900/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:618adf05e643bfc75c2079b649320ceb328881eb992e9bee4754063fb8a51a9e
+size 72807355
diff --git a/checkpoint-900/processor_config.json b/checkpoint-900/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/checkpoint-900/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/checkpoint-900/rng_state.pth b/checkpoint-900/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad
--- /dev/null
+++ b/checkpoint-900/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645
diff --git a/checkpoint-900/scheduler.pt b/checkpoint-900/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5d563813207098e5a394b16074e682d3cbdf758a
--- /dev/null
+++ b/checkpoint-900/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c50238c6ad9c1ec389a246215cc396aa1c87e4d537fe0cd9b3bed4a73b840f4d
+size 1465
diff --git a/checkpoint-900/tokenizer.json b/checkpoint-900/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/checkpoint-900/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/checkpoint-900/tokenizer_config.json b/checkpoint-900/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/checkpoint-900/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}
diff --git a/checkpoint-900/trainer_state.json b/checkpoint-900/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e0f63f3b081d00a6c3fb0a3e0d6070d7afeca4e
--- /dev/null
+++ b/checkpoint-900/trainer_state.json
@@ -0,0 +1,1302 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.16375545851528384,
+  "eval_steps": 100,
+  "global_step": 900,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009097525473071324,
+      "grad_norm": 1.0602493286132812,
+      "learning_rate": 1.2121212121212122e-06,
+      "loss": 1.7156932830810547,
+      "step": 5
+    },
+    {
+      "epoch": 0.001819505094614265,
+      "grad_norm": 1.1577719449996948,
+      "learning_rate": 2.7272727272727272e-06,
+      "loss": 1.6629371643066406,
+      "step": 10
+    },
+    {
+      "epoch": 0.0027292576419213972,
+      "grad_norm": 1.0288419723510742,
+      "learning_rate": 4.242424242424243e-06,
+      "loss": 1.6706295013427734,
+      "step": 15
+    },
+    {
+      "epoch": 0.00363901018922853,
+      "grad_norm": 2.129403829574585,
+      "learning_rate": 5.7575757575757586e-06,
+      "loss": 1.7363752365112304,
+      "step": 20
+    },
+    {
+      "epoch": 0.004548762736535662,
+      "grad_norm": 1.9468326568603516,
+      "learning_rate": 7.272727272727272e-06,
+      "loss": 1.7111135482788087,
+      "step": 25
+    },
+    {
+      "epoch": 0.0054585152838427945,
+      "grad_norm": 1.1269357204437256,
+      "learning_rate": 8.787878787878788e-06,
+      "loss": 1.6924203872680663,
+      "step": 30
+    },
+    {
+      "epoch": 0.006368267831149927,
+      "grad_norm": 1.4021248817443848,
+      "learning_rate": 1.0303030303030304e-05,
+      "loss": 1.658310317993164,
+      "step": 35
+    },
+    {
+      "epoch": 0.00727802037845706,
+      "grad_norm": 1.313381314277649,
+      "learning_rate": 1.1818181818181819e-05,
+      "loss": 1.5383296012878418,
+      "step": 40
+    },
+    {
+      "epoch": 0.008187772925764192,
+      "grad_norm": 2.4359891414642334,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.4302565574645996,
+      "step": 45
+    },
+    {
+      "epoch": 0.009097525473071324,
+      "grad_norm": 1.6459542512893677,
+      "learning_rate": 1.484848484848485e-05,
+      "loss": 1.2602953910827637,
+      "step": 50
+    },
+    {
+      "epoch": 0.010007278020378457,
+      "grad_norm": 0.7953159213066101,
+      "learning_rate": 1.6363636363636366e-05,
+      "loss": 1.204326343536377,
+      "step": 55
+    },
+    {
+      "epoch": 0.010917030567685589,
+      "grad_norm": 0.5824465155601501,
+      "learning_rate": 1.787878787878788e-05,
+      "loss": 1.068561840057373,
+      "step": 60
+    },
+    {
+      "epoch": 0.011826783114992722,
+      "grad_norm": 0.39265626668930054,
+      "learning_rate": 1.9393939393939395e-05,
+      "loss": 0.9570062637329102,
+      "step": 65
+    },
+    {
+      "epoch": 0.012736535662299854,
+      "grad_norm": 0.3387283384799957,
+      "learning_rate": 2.090909090909091e-05,
+      "loss": 0.9454713821411133,
+      "step": 70
+    },
+    {
+      "epoch": 0.013646288209606987,
+      "grad_norm": 0.3182811141014099,
+      "learning_rate": 2.2424242424242424e-05,
+      "loss": 0.8901592254638672,
+      "step": 75
+    },
+    {
+      "epoch": 0.01455604075691412,
+      "grad_norm": 0.2735312879085541,
+      "learning_rate": 2.393939393939394e-05,
+      "loss": 0.8491583824157715,
+      "step": 80
+    },
+    {
+      "epoch": 0.015465793304221253,
+      "grad_norm": 0.2376435250043869,
+      "learning_rate": 2.5454545454545454e-05,
+      "loss": 0.8109179496765136,
+      "step": 85
+    },
+    {
+      "epoch": 0.016375545851528384,
+      "grad_norm": 0.2161586880683899,
+      "learning_rate": 2.696969696969697e-05,
+      "loss": 0.76962308883667,
+      "step": 90
+    },
+    {
+      "epoch": 0.017285298398835518,
+      "grad_norm": 0.19587980210781097,
+      "learning_rate": 2.8484848484848486e-05,
+      "loss": 0.7301986694335938,
+      "step": 95
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "grad_norm": 0.20971694588661194,
+      "learning_rate": 3e-05,
+      "loss": 0.7269618034362793,
+      "step": 100
+    },
+    {
+      "epoch": 0.018195050946142648,
+      "eval_loss": 2.605874538421631,
+      "eval_runtime": 1120.0905,
+      "eval_samples_per_second": 33.935,
+      "eval_steps_per_second": 8.484,
+      "step": 100
+    },
+    {
+      "epoch": 0.01910480349344978,
+      "grad_norm": 0.10413152724504471,
+      "learning_rate": 3.151515151515151e-05,
+      "loss": 0.3250573635101318,
+      "step": 105
+    },
+    {
+      "epoch": 0.020014556040756915,
+      "grad_norm": 0.09383206814527512,
+      "learning_rate": 3.303030303030303e-05,
+      "loss": 0.3277724742889404,
+      "step": 110
+    },
+    {
+      "epoch": 0.020924308588064048,
+      "grad_norm": 0.1195850670337677,
+      "learning_rate": 3.454545454545455e-05,
+      "loss": 0.3215961217880249,
+      "step": 115
+    },
+    {
+      "epoch": 0.021834061135371178,
+      "grad_norm": 0.0715397521853447,
+      "learning_rate": 3.606060606060606e-05,
+      "loss": 0.3120795965194702,
+      "step": 120
+    },
+    {
+      "epoch": 0.02274381368267831,
+      "grad_norm": 0.068007692694664,
+      "learning_rate": 3.757575757575758e-05,
+      "loss": 0.2964257955551147,
+      "step": 125
+    },
+    {
+      "epoch": 0.023653566229985445,
+      "grad_norm": 0.09345484524965286,
+      "learning_rate": 3.909090909090909e-05,
+      "loss": 0.30776252746582033,
+      "step": 130
+    },
+    {
+      "epoch": 0.024563318777292575,
+      "grad_norm": 0.05577846243977547,
+      "learning_rate": 4.0606060606060606e-05,
+      "loss": 0.3180255889892578,
+      "step": 135
+    },
+    {
+      "epoch": 0.025473071324599708,
+      "grad_norm": 0.05919989198446274,
+      "learning_rate": 4.212121212121212e-05,
+      "loss": 0.31608285903930666,
+      "step": 140
+    },
+    {
+      "epoch": 0.02638282387190684,
+      "grad_norm": 0.05644674599170685,
+      "learning_rate": 4.3636363636363636e-05,
+      "loss": 0.2993780136108398,
+      "step": 145
+    },
+    {
+      "epoch": 0.027292576419213975,
+      "grad_norm": 0.059986088424921036,
+      "learning_rate": 4.515151515151516e-05,
+      "loss": 0.2931638479232788,
+      "step": 150
+    },
+    {
+      "epoch": 0.028202328966521105,
+      "grad_norm": 0.05941484495997429,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.29284651279449464,
+      "step": 155
+    },
+    {
+      "epoch": 0.02911208151382824,
+      "grad_norm": 0.0579044483602047,
+      "learning_rate": 4.8181818181818186e-05,
+      "loss": 0.2927037000656128,
+      "step": 160
+    },
+    {
+      "epoch": 0.030021834061135372,
+      "grad_norm": 0.061985693871974945,
+      "learning_rate": 4.9696969696969694e-05,
+      "loss": 0.28671720027923586,
+      "step": 165
+    },
+    {
+      "epoch": 0.030931586608442505,
+      "grad_norm": 0.05715535953640938,
+      "learning_rate": 4.999993064772809e-05,
+      "loss": 0.2817929744720459,
+      "step": 170
+    },
+    {
+      "epoch": 0.03184133915574964,
+      "grad_norm": 0.06549780815839767,
+      "learning_rate": 4.999964890478288e-05,
+      "loss": 0.27853829860687257,
+      "step": 175
+    },
+    {
+      "epoch": 0.03275109170305677,
+      "grad_norm": 0.05948757752776146,
+      "learning_rate": 4.999915043908795e-05,
+      "loss": 0.27522289752960205,
+      "step": 180
+    },
+    {
+      "epoch": 0.0336608442503639,
+      "grad_norm": 0.06262889504432678,
+      "learning_rate": 4.9998435254964515e-05,
+      "loss": 0.270997428894043,
+      "step": 185
+    },
+    {
+      "epoch": 0.034570596797671035,
+      "grad_norm": 0.06916829943656921,
+      "learning_rate": 4.999750335861253e-05,
+      "loss": 0.2788438558578491,
+      "step": 190
+    },
+    {
+      "epoch": 0.035480349344978165,
+      "grad_norm": 0.06128217652440071,
+      "learning_rate": 4.9996354758110624e-05,
+      "loss": 0.25649352073669435,
+      "step": 195
+    },
+    {
+      "epoch": 0.036390101892285295,
+      "grad_norm": 0.06704027950763702,
+      "learning_rate": 4.999498946341606e-05,
+      "loss": 0.25619523525238036,
+      "step": 200
+    },
+    {
+      "epoch": 0.03729985443959243,
+      "grad_norm": 0.061678580939769745,
+      "learning_rate": 4.999340748636462e-05,
+      "loss": 0.24956226348876953,
+      "step": 205
+    },
+    {
+      "epoch": 0.03820960698689956,
+      "grad_norm": 0.07328873127698898,
+      "learning_rate": 4.999160884067051e-05,
+      "loss": 0.26169676780700685,
+      "step": 210
+    },
+    {
+      "epoch": 0.0391193595342067,
+      "grad_norm": 0.08287990838289261,
+      "learning_rate": 4.9989593541926246e-05,
+      "loss": 0.2574604034423828,
+      "step": 215
+    },
+    {
+      "epoch": 0.04002911208151383,
+      "grad_norm": 0.06787359714508057,
+      "learning_rate": 4.9987361607602525e-05,
+      "loss": 0.25351409912109374,
+      "step": 220
+    },
+    {
+      "epoch": 0.04093886462882096,
+      "grad_norm": 0.06695502996444702,
+      "learning_rate": 4.998491305704805e-05,
+      "loss": 0.24522039890289307,
+      "step": 225
+    },
+    {
+      "epoch": 0.041848617176128096,
+      "grad_norm": 0.08872214704751968,
+      "learning_rate": 4.9982247911489375e-05,
+      "loss": 0.2581867933273315,
+      "step": 230
+    },
+    {
+      "epoch": 0.042758369723435226,
+      "grad_norm": 0.07637131959199905,
+      "learning_rate": 4.9979366194030743e-05,
+      "loss": 0.25569658279418944,
+      "step": 235
+    },
+    {
+      "epoch": 0.043668122270742356,
+      "grad_norm": 0.08158119022846222,
+      "learning_rate": 4.997626792965385e-05,
+      "loss": 0.2529409646987915,
+      "step": 240
+    },
+    {
+      "epoch": 0.04457787481804949,
+      "grad_norm": 0.07529161125421524,
+      "learning_rate": 4.997295314521766e-05,
+      "loss": 0.24049024581909179,
+      "step": 245
+    },
+    {
+      "epoch": 0.04548762736535662,
+      "grad_norm": 0.08860139548778534,
+      "learning_rate": 4.996942186945813e-05,
+      "loss": 0.2490522861480713,
+      "step": 250
+    },
+    {
+      "epoch": 0.04639737991266375,
+      "grad_norm": 0.0850321501493454,
+      "learning_rate": 4.9965674132988005e-05,
+      "loss": 0.24180831909179687,
+      "step": 255
+    },
+    {
+      "epoch": 0.04730713245997089,
+      "grad_norm": 0.07556115090847015,
+      "learning_rate": 4.996170996829653e-05,
+      "loss": 0.2509631872177124,
+      "step": 260
+    },
+    {
+      "epoch": 0.04821688500727802,
+      "grad_norm": 0.07971206307411194,
+      "learning_rate": 4.995752940974918e-05,
+      "loss": 0.24398891925811766,
+      "step": 265
+    },
+    {
+      "epoch": 0.04912663755458515,
+      "grad_norm": 0.09149336814880371,
+      "learning_rate": 4.9953132493587344e-05,
+      "loss": 0.2300492286682129,
+      "step": 270
+    },
+    {
+      "epoch": 0.050036390101892286,
+      "grad_norm": 0.08265820890665054,
+      "learning_rate": 4.9948519257928034e-05,
+      "loss": 0.24246792793273925,
+      "step": 275
+    },
+    {
+      "epoch": 0.050946142649199416,
+      "grad_norm": 0.10328587144613266,
+      "learning_rate": 4.9943689742763534e-05,
+      "loss": 0.2367171049118042,
+      "step": 280
+    },
+    {
+      "epoch": 0.05185589519650655,
+      "grad_norm": 0.0836917981505394,
+      "learning_rate": 4.993864398996105e-05,
+      "loss": 0.23215813636779786,
+      "step": 285
+    },
+    {
+      "epoch": 0.05276564774381368,
+      "grad_norm": 0.09475161135196686,
+      "learning_rate": 4.99333820432624e-05,
+      "loss": 0.2350748062133789,
+      "step": 290
+    },
+    {
+      "epoch": 0.05367540029112081,
+      "grad_norm": 0.08040128648281097,
+      "learning_rate": 4.992790394828355e-05,
+      "loss": 0.23253886699676513,
+      "step": 295
+    },
+    {
+      "epoch": 0.05458515283842795,
+      "grad_norm": 0.08852150291204453,
+      "learning_rate": 4.992220975251428e-05,
+      "loss": 0.23856515884399415,
+      "step": 300
+    },
+    {
+      "epoch": 0.05549490538573508,
+      "grad_norm": 0.09565229713916779,
+      "learning_rate": 4.991629950531775e-05,
+      "loss": 0.23311660289764405,
+      "step": 305
+    },
+    {
+      "epoch": 0.05640465793304221,
+      "grad_norm": 0.08158160001039505,
+      "learning_rate": 4.991017325793009e-05,
+      "loss": 0.22467944622039795,
+      "step": 310
+    },
+    {
+      "epoch": 0.05731441048034935,
+      "grad_norm": 0.07746429741382599,
+      "learning_rate": 4.990383106345994e-05,
+      "loss": 0.229844069480896,
+      "step": 315
+    },
+    {
+      "epoch": 0.05822416302765648,
+      "grad_norm": 0.08564355969429016,
+      "learning_rate": 4.989727297688797e-05,
+      "loss": 0.22414517402648926,
+      "step": 320
+    },
+    {
+      "epoch": 0.05913391557496361,
+      "grad_norm": 0.07517435401678085,
+      "learning_rate": 4.9890499055066435e-05,
+      "loss": 0.2236532211303711,
+      "step": 325
+    },
+    {
+      "epoch": 0.060043668122270744,
+      "grad_norm": 0.111734539270401,
+      "learning_rate": 4.988350935671869e-05,
+      "loss": 0.21474847793579102,
+      "step": 330
+    },
+    {
+      "epoch": 0.060953420669577874,
+      "grad_norm": 0.09906989336013794,
+      "learning_rate": 4.987630394243866e-05,
+      "loss": 0.23321933746337892,
+      "step": 335
+    },
+    {
+      "epoch": 0.06186317321688501,
+      "grad_norm": 0.10131457448005676,
+      "learning_rate": 4.98688828746903e-05,
+      "loss": 0.2310662031173706,
+      "step": 340
+    },
+    {
+      "epoch": 0.06277292576419213,
+      "grad_norm": 0.09203507006168365,
+      "learning_rate": 4.986124621780708e-05,
+      "loss": 0.22021169662475587,
+      "step": 345
+    },
+    {
+      "epoch": 0.06368267831149928,
+      "grad_norm": 0.09505912661552429,
+      "learning_rate": 4.9853394037991416e-05,
+      "loss": 0.2197155237197876,
+      "step": 350
+    },
+    {
+      "epoch": 0.06459243085880641,
+      "grad_norm": 0.09038657695055008,
+      "learning_rate": 4.984532640331412e-05,
+      "loss": 0.22066287994384765,
+      "step": 355
+    },
+    {
+      "epoch": 0.06550218340611354,
+      "grad_norm": 0.09707064181566238,
+      "learning_rate": 4.9837043383713753e-05,
+      "loss": 0.22455451488494874,
+      "step": 360
+    },
+    {
+      "epoch": 0.06641193595342067,
+      "grad_norm": 0.10367228090763092,
+      "learning_rate": 4.98285450509961e-05,
+      "loss": 0.21993820667266845,
+      "step": 365
+    },
+    {
+      "epoch": 0.0673216885007278,
+      "grad_norm": 0.12229471653699875,
+      "learning_rate": 4.9819831478833456e-05,
+      "loss": 0.2168867588043213,
+      "step": 370
+    },
+    {
+      "epoch": 0.06823144104803494,
+      "grad_norm": 0.0964592918753624,
+      "learning_rate": 4.981090274276406e-05,
+      "loss": 0.21579203605651856,
+      "step": 375
+    },
+    {
+      "epoch": 0.06914119359534207,
+      "grad_norm": 0.09400496631860733,
+      "learning_rate": 4.980175892019141e-05,
+      "loss": 0.20972180366516113,
+      "step": 380
+    },
+    {
+      "epoch": 0.0700509461426492,
+      "grad_norm": 0.08158645778894424,
+      "learning_rate": 4.9792400090383594e-05,
+      "loss": 0.22148358821868896,
+      "step": 385
+    },
+    {
+      "epoch": 0.07096069868995633,
+      "grad_norm": 0.10916394740343094,
+      "learning_rate": 4.978282633447261e-05,
+      "loss": 0.2214418649673462,
+      "step": 390
+    },
+    {
+      "epoch": 0.07187045123726346,
+      "grad_norm": 0.11138810962438583,
+      "learning_rate": 4.9773037735453636e-05,
+      "loss": 0.21814754009246826,
+      "step": 395
+    },
+    {
+      "epoch": 0.07278020378457059,
+      "grad_norm": 0.10914396494626999,
+      "learning_rate": 4.9763034378184365e-05,
+      "loss": 0.21310818195343018,
+      "step": 400
+    },
+    {
+      "epoch": 0.07368995633187773,
+      "grad_norm": 0.1043366864323616,
+      "learning_rate": 4.975281634938421e-05,
+      "loss": 0.21266789436340333,
+      "step": 405
+    },
+    {
+      "epoch": 0.07459970887918486,
+      "grad_norm": 0.1036868542432785,
+      "learning_rate": 4.9742383737633594e-05,
+      "loss": 0.21606721878051757,
+      "step": 410
+    },
+    {
+      "epoch": 0.075509461426492,
+      "grad_norm": 0.11640442907810211,
+      "learning_rate": 4.9731736633373144e-05,
+      "loss": 0.21532948017120362,
+      "step": 415
+    },
+    {
+      "epoch": 0.07641921397379912,
+      "grad_norm": 0.11219926178455353,
+      "learning_rate": 4.9720875128902956e-05,
+      "loss": 0.2191627025604248,
+      "step": 420
+    },
+    {
+      "epoch": 0.07732896652110625,
+      "grad_norm": 0.12103637307882309,
+      "learning_rate": 4.970979931838176e-05,
+      "loss": 0.20938868522644044,
+      "step": 425
+    },
+    {
+      "epoch": 0.0782387190684134,
+      "grad_norm": 0.13274189829826355,
+      "learning_rate": 4.96985092978261e-05,
+      "loss": 0.21792960166931152,
+      "step": 430
+    },
+    {
+      "epoch": 0.07914847161572053,
+      "grad_norm": 0.11164513230323792,
+      "learning_rate": 4.968700516510954e-05,
+      "loss": 0.2022618055343628,
+      "step": 435
+    },
+    {
+      "epoch": 0.08005822416302766,
+      "grad_norm": 0.09532847255468369,
+      "learning_rate": 4.967528701996174e-05,
+      "loss": 0.21255812644958497,
+      "step": 440
+    },
+    {
+      "epoch": 0.08096797671033479,
+      "grad_norm": 0.10279258340597153,
+      "learning_rate": 4.96633549639677e-05,
+      "loss": 0.20683050155639648,
+      "step": 445
+    },
+    {
+      "epoch": 0.08187772925764192,
+      "grad_norm": 0.1257462352514267,
+      "learning_rate": 4.965120910056677e-05,
+      "loss": 0.21419920921325683,
+      "step": 450
+    },
+    {
+      "epoch": 0.08278748180494905,
+      "grad_norm": 0.11663137376308441,
+      "learning_rate": 4.963884953505186e-05,
+      "loss": 0.2072287082672119,
+      "step": 455
+    },
+    {
+      "epoch": 0.08369723435225619,
+      "grad_norm": 0.10488224029541016,
+      "learning_rate": 4.96262763745684e-05,
+      "loss": 0.1982678532600403,
+      "step": 460
+    },
+    {
+      "epoch": 0.08460698689956332,
+      "grad_norm": 0.11801692098379135,
+      "learning_rate": 4.961348972811354e-05,
+      "loss": 0.20662031173706055,
+      "step": 465
+    },
+    {
+      "epoch": 0.08551673944687045,
+      "grad_norm": 0.11318827420473099,
+      "learning_rate": 4.96004897065351e-05,
+      "loss": 0.20947303771972656,
+      "step": 470
+    },
+    {
+      "epoch": 0.08642649199417758,
+      "grad_norm": 0.13409486413002014,
+      "learning_rate": 4.95872764225307e-05,
+      "loss": 0.19670876264572143,
+      "step": 475
+    },
+    {
+      "epoch": 0.08733624454148471,
+      "grad_norm": 0.14440792798995972,
+      "learning_rate": 4.957384999064672e-05,
+      "loss": 0.19842848777770997,
+      "step": 480
+    },
+    {
+      "epoch": 0.08824599708879186,
+      "grad_norm": 0.12246996909379959,
+      "learning_rate": 4.956021052727731e-05,
+      "loss": 0.20318071842193602,
+      "step": 485
+    },
+    {
+      "epoch": 0.08915574963609899,
+      "grad_norm": 0.13437233865261078,
+      "learning_rate": 4.954635815066342e-05,
+      "loss": 0.21675212383270265,
+      "step": 490
+    },
+    {
+      "epoch": 0.09006550218340612,
+      "grad_norm": 0.11109672486782074,
+      "learning_rate": 4.9532292980891744e-05,
+      "loss": 0.2100757837295532,
+      "step": 495
+    },
+    {
+      "epoch": 0.09097525473071325,
+      "grad_norm": 0.1388893872499466,
+      "learning_rate": 4.9518015139893675e-05,
+      "loss": 0.20303285121917725,
+      "step": 500
+    },
+    {
+      "epoch": 0.09188500727802038,
+      "grad_norm": 0.13239721953868866,
+      "learning_rate": 4.950352475144427e-05,
+      "loss": 0.2152268409729004,
+      "step": 505
+    },
+    {
+      "epoch": 0.0927947598253275,
+      "grad_norm": 0.12834979593753815,
+      "learning_rate": 4.948882194116119e-05,
+      "loss": 0.20799248218536376,
+      "step": 510
+    },
+    {
+      "epoch": 0.09370451237263465,
+      "grad_norm": 0.11886704713106155,
+      "learning_rate": 4.947390683650354e-05,
+      "loss": 0.20394976139068605,
+      "step": 515
+    },
+    {
+      "epoch": 0.09461426491994178,
+      "grad_norm": 0.11398876458406448,
+      "learning_rate": 4.945877956677083e-05,
+      "loss": 0.2091092586517334,
+      "step": 520
+    },
+    {
+      "epoch": 0.09552401746724891,
+      "grad_norm": 0.1422540694475174,
+      "learning_rate": 4.944344026310186e-05,
+      "loss": 0.19564238786697388,
+      "step": 525
+    },
+    {
+      "epoch": 0.09643377001455604,
+      "grad_norm": 0.11359584331512451,
+      "learning_rate": 4.9427889058473535e-05,
+      "loss": 0.20493624210357667,
+      "step": 530
+    },
+    {
+      "epoch": 0.09734352256186317,
+      "grad_norm": 0.11703553050756454,
+      "learning_rate": 4.941212608769974e-05,
+      "loss": 0.2098615884780884,
+      "step": 535
+    },
+    {
+      "epoch": 0.0982532751091703,
+      "grad_norm": 0.14552047848701477,
+      "learning_rate": 4.939615148743017e-05,
+      "loss": 0.20382182598114013,
+      "step": 540
+    },
+    {
+      "epoch": 0.09916302765647744,
+      "grad_norm": 0.13178016245365143,
+      "learning_rate": 4.937996539614914e-05,
+      "loss": 0.19901862144470214,
+      "step": 545
+    },
+    {
+      "epoch": 0.10007278020378457,
+      "grad_norm": 0.635392427444458,
+      "learning_rate": 4.936356795417439e-05,
+      "loss": 0.20694944858551026,
+      "step": 550
+    },
+    {
+      "epoch": 0.1009825327510917,
+      "grad_norm": 0.15019077062606812,
+      "learning_rate": 4.934695930365586e-05,
+      "loss": 0.19313746690750122,
+      "step": 555
+    },
+    {
+      "epoch": 0.10189228529839883,
+      "grad_norm": 0.12941956520080566,
+      "learning_rate": 4.9330139588574474e-05,
+      "loss": 0.19671722650527954,
+      "step": 560
+    },
+    {
+      "epoch": 0.10280203784570596,
+      "grad_norm": 0.13818831741809845,
+      "learning_rate": 4.931310895474088e-05,
+      "loss": 0.20026786327362062,
+      "step": 565
+    },
+    {
+      "epoch": 0.1037117903930131,
+      "grad_norm": 0.12011194974184036,
+      "learning_rate": 4.929586754979417e-05,
+      "loss": 0.1932437539100647,
+      "step": 570
+    },
+    {
+      "epoch": 0.10462154294032024,
+      "grad_norm": 0.1345364898443222,
+      "learning_rate": 4.9278415523200644e-05,
+      "loss": 0.20245940685272218,
+      "step": 575
+    },
+    {
+      "epoch": 0.10553129548762737,
+      "grad_norm": 0.13281017541885376,
+      "learning_rate": 4.926075302625247e-05,
+      "loss": 0.19864981174468993,
+      "step": 580
+    },
+    {
+      "epoch": 0.1064410480349345,
+      "grad_norm": 0.13465586304664612,
+      "learning_rate": 4.924288021206639e-05,
+      "loss": 0.19573183059692384,
+      "step": 585
+    },
+    {
+      "epoch": 0.10735080058224163,
+      "grad_norm": 0.15225961804389954,
+      "learning_rate": 4.9224797235582396e-05,
+      "loss": 0.19946801662445068,
+      "step": 590
+    },
+    {
+      "epoch": 0.10826055312954876,
+      "grad_norm": 0.12816746532917023,
+      "learning_rate": 4.92065042535624e-05,
+      "loss": 0.19851526021957397,
+      "step": 595
+    },
+    {
+      "epoch": 0.1091703056768559,
+      "grad_norm": 0.13802853226661682,
+      "learning_rate": 4.9188001424588824e-05,
+      "loss": 0.19321763515472412,
+      "step": 600
+    },
+    {
+      "epoch": 0.11008005822416303,
+      "grad_norm": 0.17504797875881195,
+      "learning_rate": 4.9169288909063295e-05,
+      "loss": 0.2032616138458252,
+      "step": 605
+    },
+    {
+      "epoch": 0.11098981077147016,
+      "grad_norm": 0.13544194400310516,
+      "learning_rate": 4.91503668692052e-05,
+      "loss": 0.2011256456375122,
+      "step": 610
+    },
+    {
+      "epoch": 0.11189956331877729,
+      "grad_norm": 1.3976134061813354,
+      "learning_rate": 4.91312354690503e-05,
+      "loss": 0.19916868209838867,
+      "step": 615
+    },
+    {
+      "epoch": 0.11280931586608442,
+      "grad_norm": 0.1465059071779251,
+      "learning_rate": 4.91118948744493e-05,
+      "loss": 0.19487457275390624,
+      "step": 620
+    },
+    {
+      "epoch": 0.11371906841339156,
+      "grad_norm": 0.12103168666362762,
+      "learning_rate": 4.909234525306645e-05,
+      "loss": 0.1907251238822937,
+      "step": 625
+    },
+    {
+      "epoch": 0.1146288209606987,
+      "grad_norm": 0.12660574913024902,
+      "learning_rate": 4.907258677437802e-05,
+      "loss": 0.19327253103256226,
+      "step": 630
+    },
+    {
+      "epoch": 0.11553857350800582,
+      "grad_norm": 0.1347813606262207,
+      "learning_rate": 4.90526196096709e-05,
+      "loss": 0.19637736082077026,
+      "step": 635
+    },
+    {
+      "epoch": 0.11644832605531295,
+      "grad_norm": 0.14953652024269104,
+      "learning_rate": 4.903244393204107e-05,
+      "loss": 0.20325069427490233,
+      "step": 640
+    },
+    {
+      "epoch": 0.11735807860262008,
+      "grad_norm": 0.13936272263526917,
+      "learning_rate": 4.901205991639213e-05,
+      "loss": 0.1930275321006775,
+      "step": 645
+    },
+    {
+      "epoch": 0.11826783114992721,
+      "grad_norm": 0.1448420137166977,
+      "learning_rate": 4.899146773943374e-05,
+      "loss": 0.20026936531066894,
+      "step": 650
+    },
+    {
+      "epoch": 0.11917758369723436,
+      "grad_norm": 0.1312534064054489,
+      "learning_rate": 4.897066757968014e-05,
+      "loss": 0.19062033891677857,
+      "step": 655
+    },
+    {
+      "epoch": 0.12008733624454149,
+      "grad_norm": 0.13644742965698242,
+      "learning_rate": 4.894965961744859e-05,
+      "loss": 0.18719595670700073,
+      "step": 660
+    },
+    {
+      "epoch": 0.12099708879184862,
+      "grad_norm": 0.14276087284088135,
+      "learning_rate": 4.892844403485777e-05,
+      "loss": 0.19784307479858398,
+      "step": 665
+    },
+    {
+      "epoch": 0.12190684133915575,
+      "grad_norm": 0.14735399186611176,
+      "learning_rate": 4.890702101582623e-05,
+      "loss": 0.19163782596588136,
+      "step": 670
+    },
+    {
+      "epoch": 0.12281659388646288,
+      "grad_norm": 0.15742065012454987,
+      "learning_rate": 4.888539074607082e-05,
+      "loss": 0.19312986135482788,
+      "step": 675
+    },
+    {
+      "epoch": 0.12372634643377002,
+      "grad_norm": 0.12917031347751617,
+      "learning_rate": 4.8863553413105025e-05,
+      "loss": 0.20066320896148682,
+      "step": 680
+    },
+    {
+      "epoch": 0.12463609898107715,
+      "grad_norm": 0.1484801322221756,
+      "learning_rate": 4.884150920623737e-05,
+      "loss": 0.20096096992492676,
+      "step": 685
+    },
+    {
+      "epoch": 0.12554585152838427,
+      "grad_norm": 0.1455296128988266,
+      "learning_rate": 4.88192583165698e-05,
+      "loss": 0.20518505573272705,
+      "step": 690
+    },
+    {
+      "epoch": 0.12645560407569142,
+      "grad_norm": 0.14517490565776825,
+      "learning_rate": 4.879680093699598e-05,
+      "loss": 0.18859238624572755,
+      "step": 695
+    },
+    {
+      "epoch": 0.12736535662299855,
+      "grad_norm": 0.18778090178966522,
+      "learning_rate": 4.877413726219964e-05,
+      "loss": 0.197074818611145,
+      "step": 700
+    },
+    {
+      "epoch": 0.12827510917030568,
+      "grad_norm": 0.13497677445411682,
+      "learning_rate": 4.87512674886529e-05,
+      "loss": 0.18713107109069824,
+      "step": 705
+    },
+    {
+      "epoch": 0.12918486171761281,
+      "grad_norm": 0.12657155096530914,
+      "learning_rate": 4.872819181461455e-05,
+      "loss": 0.1858484387397766,
+      "step": 710
+    },
+    {
+      "epoch": 0.13009461426491994,
+      "grad_norm": 0.11458148807287216,
+      "learning_rate": 4.870491044012834e-05,
+      "loss": 0.18732179403305055,
+      "step": 715
+    },
+    {
+      "epoch": 0.13100436681222707,
+      "grad_norm": 0.13000249862670898,
+      "learning_rate": 4.8681423567021244e-05,
+      "loss": 0.1872936010360718,
+      "step": 720
+    },
+    {
+      "epoch": 0.1319141193595342,
+      "grad_norm": 0.14580890536308289,
+      "learning_rate": 4.865773139890172e-05,
+      "loss": 0.19280019998550416,
+      "step": 725
+    },
+    {
+      "epoch": 0.13282387190684133,
+      "grad_norm": 0.1507277935743332,
+      "learning_rate": 4.8633834141157913e-05,
+      "loss": 0.1898929238319397,
+      "step": 730
+    },
+    {
+      "epoch": 0.13373362445414846,
+      "grad_norm": 0.1418737769126892,
+      "learning_rate": 4.860973200095592e-05,
+      "loss": 0.17926375865936278,
+      "step": 735
+    },
+    {
+      "epoch": 0.1346433770014556,
+      "grad_norm": 0.17151866853237152,
+      "learning_rate": 4.858542518723794e-05,
+      "loss": 0.18963592052459716,
+      "step": 740
+    },
+    {
+      "epoch": 0.13555312954876272,
+      "grad_norm": 0.11162743717432022,
+      "learning_rate": 4.8560913910720535e-05,
+      "loss": 0.19466646909713745,
+      "step": 745
+    },
+    {
+      "epoch": 0.13646288209606988,
+      "grad_norm": 0.15628376603126526,
+      "learning_rate": 4.8536198383892725e-05,
+      "loss": 0.19494034051895143,
+      "step": 750
+    },
+    {
+      "epoch": 0.137372634643377,
+      "grad_norm": 0.18209289014339447,
+      "learning_rate": 4.851127882101421e-05,
+      "loss": 0.18747550249099731,
+      "step": 755
+    },
+    {
+      "epoch": 0.13828238719068414,
+      "grad_norm": 0.14559614658355713,
+      "learning_rate": 4.8486155438113454e-05,
+      "loss": 0.1897158980369568,
+      "step": 760
+    },
+    {
+      "epoch": 0.13919213973799127,
+      "grad_norm": 0.3198587894439697,
+      "learning_rate": 4.846082845298586e-05,
+      "loss": 0.18571001291275024,
+      "step": 765
+    },
+    {
+      "epoch": 0.1401018922852984,
+      "grad_norm": 0.1486678421497345,
+      "learning_rate": 4.843529808519189e-05,
+      "loss": 0.19561930894851684,
+      "step": 770
+    },
+    {
+      "epoch": 0.14101164483260553,
+      "grad_norm": 0.15318170189857483,
+      "learning_rate": 4.840956455605509e-05,
+      "loss": 0.187040114402771,
+      "step": 775
+    },
+    {
+      "epoch": 0.14192139737991266,
+      "grad_norm": 0.13754244148731232,
+      "learning_rate": 4.838362808866025e-05,
+      "loss": 0.18345539569854735,
+      "step": 780
+    },
+    {
+      "epoch": 0.1428311499272198,
+      "grad_norm": 0.12943248450756073,
+      "learning_rate": 4.835748890785143e-05,
+      "loss": 0.1921079397201538,
+      "step": 785
+    },
+    {
+      "epoch": 0.14374090247452692,
+      "grad_norm": 0.110458143055439,
+      "learning_rate": 4.833114724023001e-05,
+      "loss": 0.17927205562591553,
+      "step": 790
+    },
+    {
+      "epoch": 0.14465065502183405,
+      "grad_norm": 0.2421770840883255,
+      "learning_rate": 4.830460331415275e-05,
+      "loss": 0.18317567110061644,
+      "step": 795
+    },
+    {
+      "epoch": 0.14556040756914118,
+      "grad_norm": 0.14752762019634247,
+      "learning_rate": 4.8277857359729787e-05,
+      "loss": 0.1843916058540344,
+      "step": 800
+    },
+    {
+      "epoch": 0.14647016011644834,
+      "grad_norm": 0.15043556690216064,
+      "learning_rate": 4.8250909608822644e-05,
+      "loss": 0.18354393243789674,
+      "step": 805
+    },
+    {
+      "epoch": 0.14737991266375547,
+      "grad_norm": 0.1381794661283493,
+      "learning_rate": 4.822376029504223e-05,
+      "loss": 0.1789781332015991,
+      "step": 810
+    },
+    {
+      "epoch": 0.1482896652110626,
+      "grad_norm": 0.18386174738407135,
+      "learning_rate": 4.819640965374681e-05,
+      "loss": 0.19494292736053467,
+      "step": 815
+    },
+    {
+      "epoch": 0.14919941775836973,
+      "grad_norm": 0.13829593360424042,
+      "learning_rate": 4.816885792203996e-05,
+      "loss": 0.18486063480377196,
+      "step": 820
+    },
+    {
+      "epoch": 0.15010917030567686,
+      "grad_norm": 0.15033291280269623,
+      "learning_rate": 4.814110533876852e-05,
+      "loss": 0.18061509132385253,
+      "step": 825
+    },
+    {
+      "epoch": 0.151018922852984,
+      "grad_norm": 0.17150473594665527,
+      "learning_rate": 4.811315214452051e-05,
+      "loss": 0.18464866876602173,
+      "step": 830
+    },
+    {
+      "epoch": 0.15192867540029112,
+      "grad_norm": 0.15317125618457794,
+      "learning_rate": 4.808499858162307e-05,
+      "loss": 0.1837708592414856,
+      "step": 835
+    },
+    {
+      "epoch": 0.15283842794759825,
+      "grad_norm": 0.2671392560005188,
+      "learning_rate": 4.805664489414031e-05,
+      "loss": 0.19338636398315429,
+      "step": 840
+    },
+    {
+      "epoch": 0.15374818049490538,
+      "grad_norm": 0.14047028124332428,
+      "learning_rate": 4.802809132787125e-05,
+      "loss": 0.17069108486175538,
+      "step": 845
+    },
+    {
+      "epoch": 0.1546579330422125,
+      "grad_norm": 0.1520431935787201,
+      "learning_rate": 4.799933813034768e-05,
+      "loss": 0.18607735633850098,
+      "step": 850
+    },
+    {
+      "epoch": 0.15556768558951964,
+      "grad_norm": 0.17239463329315186,
+      "learning_rate": 4.797038555083197e-05,
+      "loss": 0.18069062232971192,
+      "step": 855
+    },
+    {
+      "epoch": 0.1564774381368268,
+      "grad_norm": 0.1377955675125122,
+      "learning_rate": 4.794123384031495e-05,
+      "loss": 0.18870222568511963,
+      "step": 860
+    },
+    {
+      "epoch": 0.15738719068413393,
+      "grad_norm": 0.15901461243629456,
+      "learning_rate": 4.791188325151373e-05,
+      "loss": 0.18128334283828734,
+      "step": 865
+    },
+    {
+      "epoch": 0.15829694323144106,
+      "grad_norm": 0.14634132385253906,
+      "learning_rate": 4.7882334038869495e-05,
+      "loss": 0.1866163969039917,
+      "step": 870
+    },
+    {
+      "epoch": 0.1592066957787482,
+      "grad_norm": 0.15361061692237854,
+      "learning_rate": 4.785258645854529e-05,
+      "loss": 0.17850807905197144,
+      "step": 875
+    },
+    {
+      "epoch": 0.16011644832605532,
+      "grad_norm": 0.13751649856567383,
+      "learning_rate": 4.782264076842385e-05,
+      "loss": 0.17731113433837892,
+      "step": 880
+    },
+    {
+      "epoch": 0.16102620087336245,
+      "grad_norm": 0.17909638583660126,
+      "learning_rate": 4.7792497228105314e-05,
+      "loss": 0.18344542980194092,
+      "step": 885
+    },
+    {
+      "epoch": 0.16193595342066958,
+      "grad_norm": 0.16038304567337036,
+      "learning_rate": 4.776215609890498e-05,
+      "loss": 0.18868647813796996,
+      "step": 890
+    },
+    {
+      "epoch": 0.1628457059679767,
+      "grad_norm": 0.1653951108455658,
+      "learning_rate": 4.773161764385107e-05,
+      "loss": 0.18614152669906617,
+      "step": 895
+    },
+    {
+      "epoch": 0.16375545851528384,
+      "grad_norm": 0.16193026304244995,
+      "learning_rate": 4.770088212768241e-05,
+      "loss": 0.18564575910568237,
+      "step": 900
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.0362254118877645e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-900/training_args.bin b/checkpoint-900/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba
--- /dev/null
+++ b/checkpoint-900/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
+size 5777
diff --git a/processor_config.json b/processor_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1
--- /dev/null
+++ b/processor_config.json
@@ -0,0 +1,75 @@
+{
+  "audio_ms_per_token": 40,
+  "audio_seq_length": 750,
+  "feature_extractor": {
+    "dither": 0.0,
+    "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+    "feature_size": 128,
+    "fft_length": 512,
+    "fft_overdrive": false,
+    "frame_length": 320,
+    "hop_length": 160,
+    "input_scale_factor": 1.0,
+    "max_frequency": 8000.0,
+    "mel_floor": 0.001,
+    "min_frequency": 0.0,
+    "padding_side": "left",
+    "padding_value": 0.0,
+    "per_bin_mean": null,
+    "per_bin_stddev": null,
+    "preemphasis": 0.0,
+    "preemphasis_htk_flavor": true,
+    "return_attention_mask": true,
+    "sampling_rate": 16000
+  },
+  "image_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": false,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_processor_type": "Gemma4ImageProcessor",
+    "image_seq_length": 280,
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 280,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098
+  },
+  "image_seq_length": 280,
+  "processor_class": "Gemma4Processor",
+  "video_processor": {
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "do_sample_frames": true,
+    "image_mean": [
+      0.0,
+      0.0,
+      0.0
+    ],
+    "image_std": [
+      1.0,
+      1.0,
+      1.0
+    ],
+    "max_soft_tokens": 70,
+    "num_frames": 32,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "return_metadata": false,
+    "video_processor_type": "Gemma4VideoProcessor"
+  }
+}
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,289 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<turn|>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 131072,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "properties": {
+      "content": {
+        "type": "string"
+      },
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "items": {
+          "properties": {
+            "function": {
+              "properties": {
+                "arguments": {
+                  "additionalProperties": {},
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call"
+                },
+                "name": {
+                  "type": "string"
+                }
+              },
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
+            },
+            "type": {
+              "const": "function"
+            }
+          },
+          "type": "object"
+        },
+        "type": "array",
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
+      }
+    },
+    "type": "object",
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "46": {
+      "content": "<|tool>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "47": {
+      "content": "<tool|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "48": {
+      "content": "<|tool_call>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "49": {
+      "content": "<tool_call|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "50": {
+      "content": "<|tool_response>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "51": {
+      "content": "<tool_response|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "52": {
+      "content": "<|\"|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "98": {
+      "content": "<|think|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "100": {
+      "content": "<|channel>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "101": {
+      "content": "<channel|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "105": {
+      "content": "<|turn>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "106": {
+      "content": "<turn|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "255999": {
+      "content": "<|image>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "256000": {
+      "content": "<|audio>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258880": {
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258881": {
+      "content": "<|audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258882": {
+      "content": "<image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258883": {
+      "content": "<audio|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    "258884": {
+      "content": "<|video|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  }
+}